Knowledge Discovery and Data Mining¶

Project: Predicting the Price of California Wine¶

Course : CS-513 A¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, roc_auc_score
from mlxtend.plotting import category_scatter, plot_learning_curves, plot_decision_regions
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
data = pd.read_csv("Californa_Wine_Production_1980_2020.csv")
In [4]:
data
Out[4]:
Year CommodityCode CropName CountyCode County HarvestedAcres Yield(Unit/Acre) Production Price(Dollars/Unit) Unit Value(Dollars)
0 2020 216299 GRAPESWINE 1 Alameda 2530.0 5.14 13000.0 1497.69 Tons 19470000
1 2020 216299 GRAPESWINE 5 Amador 5360.0 2.31 12400.0 1318.31 Tons 16347000
2 2020 216299 GRAPESWINE 9 Calaveras 579.0 3.06 1770.0 1325.99 Tons 2347000
3 2020 216299 GRAPESWINE 11 Colusa 747.0 6.02 4500.0 684.67 Tons 3081000
4 2020 216299 GRAPESWINE 13 ContraCosta 1940.0 4.69 9090.0 751.27 Tons 6829000
... ... ... ... ... ... ... ... ... ... ... ...
1310 1980 216299 GRAPESWINE 95 Solano 1138.0 3.99 4544.0 315.00 TONS 1433300
1311 1980 216299 GRAPESWINE 97 Sonoma 23639.0 3.34 78941.0 506.00 TONS 39982000
1312 1980 216299 GRAPESWINE 99 Stanislaus 17950.0 8.80 157900.0 183.00 TONS 28848000
1313 1980 216299 GRAPESWINE 107 Tulare 15159.0 8.88 134600.0 170.00 TONS 22902000
1314 1980 216299 GRAPESWINE 113 Yolo 566.0 8.70 4924.0 274.00 TONS 1351000

1315 rows × 11 columns

In [5]:
data.rename(columns = {'Yield(Unit/Acre)':'Yield', 'Price(Dollars/Unit)':'Price', 'Value(Dollars)':'Value'}, inplace = True)
In [6]:
data.Unit = "TONS"
In [7]:
data
Out[7]:
Year CommodityCode CropName CountyCode County HarvestedAcres Yield Production Price Unit Value
0 2020 216299 GRAPESWINE 1 Alameda 2530.0 5.14 13000.0 1497.69 TONS 19470000
1 2020 216299 GRAPESWINE 5 Amador 5360.0 2.31 12400.0 1318.31 TONS 16347000
2 2020 216299 GRAPESWINE 9 Calaveras 579.0 3.06 1770.0 1325.99 TONS 2347000
3 2020 216299 GRAPESWINE 11 Colusa 747.0 6.02 4500.0 684.67 TONS 3081000
4 2020 216299 GRAPESWINE 13 ContraCosta 1940.0 4.69 9090.0 751.27 TONS 6829000
... ... ... ... ... ... ... ... ... ... ... ...
1310 1980 216299 GRAPESWINE 95 Solano 1138.0 3.99 4544.0 315.00 TONS 1433300
1311 1980 216299 GRAPESWINE 97 Sonoma 23639.0 3.34 78941.0 506.00 TONS 39982000
1312 1980 216299 GRAPESWINE 99 Stanislaus 17950.0 8.80 157900.0 183.00 TONS 28848000
1313 1980 216299 GRAPESWINE 107 Tulare 15159.0 8.88 134600.0 170.00 TONS 22902000
1314 1980 216299 GRAPESWINE 113 Yolo 566.0 8.70 4924.0 274.00 TONS 1351000

1315 rows × 11 columns

In [8]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1315 entries, 0 to 1314
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1315 non-null   int64  
 1   CommodityCode   1315 non-null   int64  
 2   CropName        1315 non-null   object 
 3   CountyCode      1315 non-null   int64  
 4   County          1315 non-null   object 
 5   HarvestedAcres  1302 non-null   float64
 6   Yield           1266 non-null   float64
 7   Production      1278 non-null   float64
 8   Price           1278 non-null   float64
 9   Unit            1315 non-null   object 
 10  Value           1315 non-null   int64  
dtypes: float64(4), int64(4), object(3)
memory usage: 113.1+ KB
In [9]:
data.describe()
Out[9]:
Year CommodityCode CountyCode HarvestedAcres Yield Production Price Value
count 1315.000000 1315.0 1315.000000 1302.000000 1266.000000 1.278000e+03 1278.000000 1.315000e+03
mean 2001.171103 216299.0 59.051711 14546.443164 5.176288 1.023638e+05 928.027363 5.961252e+07
std 11.751631 0.0 31.083778 20576.581018 3.267309 1.711560e+05 1244.949783 1.237376e+08
min 1980.000000 216299.0 1.000000 3.000000 0.060000 2.300000e+01 74.000000 1.150000e+04
25% 1991.000000 216299.0 33.000000 805.750000 2.940000 3.259750e+03 335.827500 1.600500e+06
50% 2002.000000 216299.0 61.000000 3699.000000 4.040000 1.740000e+04 723.500000 1.220210e+07
75% 2011.000000 216299.0 83.000000 23884.500000 7.185000 1.298860e+05 1236.887500 6.340450e+07
max 2020.000000 216299.0 115.000000 128613.000000 25.000000 1.040100e+06 36342.070000 2.030002e+09
In [10]:
data.describe(include='all')
Out[10]:
Year CommodityCode CropName CountyCode County HarvestedAcres Yield Production Price Unit Value
count 1315.000000 1315.0 1315 1315.000000 1315 1302.000000 1266.000000 1.278000e+03 1278.000000 1315 1.315000e+03
unique NaN NaN 1 NaN 42 NaN NaN NaN NaN 1 NaN
top NaN NaN GRAPESWINE NaN Alameda NaN NaN NaN NaN TONS NaN
freq NaN NaN 1315 NaN 41 NaN NaN NaN NaN 1315 NaN
mean 2001.171103 216299.0 NaN 59.051711 NaN 14546.443164 5.176288 1.023638e+05 928.027363 NaN 5.961252e+07
std 11.751631 0.0 NaN 31.083778 NaN 20576.581018 3.267309 1.711560e+05 1244.949783 NaN 1.237376e+08
min 1980.000000 216299.0 NaN 1.000000 NaN 3.000000 0.060000 2.300000e+01 74.000000 NaN 1.150000e+04
25% 1991.000000 216299.0 NaN 33.000000 NaN 805.750000 2.940000 3.259750e+03 335.827500 NaN 1.600500e+06
50% 2002.000000 216299.0 NaN 61.000000 NaN 3699.000000 4.040000 1.740000e+04 723.500000 NaN 1.220210e+07
75% 2011.000000 216299.0 NaN 83.000000 NaN 23884.500000 7.185000 1.298860e+05 1236.887500 NaN 6.340450e+07
max 2020.000000 216299.0 NaN 115.000000 NaN 128613.000000 25.000000 1.040100e+06 36342.070000 NaN 2.030002e+09
In [11]:
data.dtypes
Out[11]:
Year                int64
CommodityCode       int64
CropName           object
CountyCode          int64
County             object
HarvestedAcres    float64
Yield             float64
Production        float64
Price             float64
Unit               object
Value               int64
dtype: object
In [12]:
data = data.drop(columns=['CommodityCode','CropName','Unit','Value'])
In [13]:
data.duplicated().any()
Out[13]:
False
In [14]:
data.duplicated().value_counts()
Out[14]:
False    1315
dtype: int64
In [15]:
data.isnull().sum()
Out[15]:
Year               0
CountyCode         0
County             0
HarvestedAcres    13
Yield             49
Production        37
Price             37
dtype: int64
In [16]:
data.columns[data.isnull().any()]
Out[16]:
Index(['HarvestedAcres', 'Yield', 'Production', 'Price'], dtype='object')
In [17]:
data.isnull().value_counts()
Out[17]:
Year   CountyCode  County  HarvestedAcres  Yield  Production  Price
False  False       False   False           False  False       False    1266
                                           True   True        True       36
                           True            True   False       False      12
                                                  True        True        1
dtype: int64
In [18]:
data[data.HarvestedAcres.isna()].sort_values(['County','Year'])
Out[18]:
Year CountyCode County HarvestedAcres Yield Production Price
1225 1983 57 Nevada NaN NaN 23.0 500.0
1193 1984 57 Nevada NaN NaN 46.0 496.0
1163 1985 57 Nevada NaN NaN 62.0 405.0
1134 1986 57 Nevada NaN NaN 125.0 435.0
1103 1987 57 Nevada NaN NaN 238.0 465.0
1072 1988 57 Nevada NaN NaN 223.0 502.0
1043 1989 57 Nevada NaN NaN 473.0 656.0
1015 1990 57 Nevada NaN NaN 527.0 928.0
592 2004 81 SanMateo NaN NaN NaN NaN
760 1999 105 Trinity NaN NaN 140.0 1204.0
728 2000 105 Trinity NaN NaN 140.0 1204.0
698 2001 105 Trinity NaN NaN 140.0 1204.0
666 2002 105 Trinity NaN NaN 140.0 1204.0
In [19]:
data[data.Yield.isna()].sort_values(['County','Year'])
Out[19]:
Year CountyCode County HarvestedAcres Yield Production Price
1225 1983 57 Nevada NaN NaN 23.0 500.0
1193 1984 57 Nevada NaN NaN 46.0 496.0
1163 1985 57 Nevada NaN NaN 62.0 405.0
1134 1986 57 Nevada NaN NaN 125.0 435.0
1103 1987 57 Nevada NaN NaN 238.0 465.0
1072 1988 57 Nevada NaN NaN 223.0 502.0
1043 1989 57 Nevada NaN NaN 473.0 656.0
1015 1990 57 Nevada NaN NaN 527.0 928.0
16 2020 57 Nevada 408.0 NaN NaN NaN
936 1993 81 SanMateo 48.0 NaN NaN NaN
907 1994 81 SanMateo 54.0 NaN NaN NaN
877 1995 81 SanMateo 56.0 NaN NaN NaN
848 1996 81 SanMateo 56.0 NaN NaN NaN
819 1997 81 SanMateo 52.0 NaN NaN NaN
786 1998 81 SanMateo 54.0 NaN NaN NaN
754 1999 81 SanMateo 40.0 NaN NaN NaN
723 2000 81 SanMateo 45.0 NaN NaN NaN
692 2001 81 SanMateo 44.0 NaN NaN NaN
660 2002 81 SanMateo 64.0 NaN NaN NaN
626 2003 81 SanMateo 60.0 NaN NaN NaN
592 2004 81 SanMateo NaN NaN NaN NaN
558 2005 81 SanMateo 88.0 NaN NaN NaN
524 2006 81 SanMateo 89.0 NaN NaN NaN
490 2007 81 SanMateo 98.0 NaN NaN NaN
456 2008 81 SanMateo 96.0 NaN NaN NaN
421 2009 81 SanMateo 135.0 NaN NaN NaN
386 2010 81 SanMateo 137.0 NaN NaN NaN
350 2011 81 SanMateo 135.0 NaN NaN NaN
314 2012 81 SanMateo 131.0 NaN NaN NaN
278 2013 81 SanMateo 153.0 NaN NaN NaN
243 2014 81 SanMateo 154.0 NaN NaN NaN
209 2015 81 SanMateo 165.0 NaN NaN NaN
173 2016 81 SanMateo 152.0 NaN NaN NaN
137 2017 81 SanMateo 164.0 NaN NaN NaN
100 2018 81 SanMateo 126.0 NaN NaN NaN
62 2019 81 SanMateo 181.0 NaN NaN NaN
25 2020 81 SanMateo 176.0 NaN NaN NaN
793 1998 105 Trinity 85.0 NaN NaN NaN
760 1999 105 Trinity NaN NaN 140.0 1204.0
728 2000 105 Trinity NaN NaN 140.0 1204.0
698 2001 105 Trinity NaN NaN 140.0 1204.0
666 2002 105 Trinity NaN NaN 140.0 1204.0
632 2003 105 Trinity 114.0 NaN NaN NaN
598 2004 105 Trinity 114.0 NaN NaN NaN
564 2005 105 Trinity 114.0 NaN NaN NaN
530 2006 105 Trinity 114.0 NaN NaN NaN
496 2007 105 Trinity 114.0 NaN NaN NaN
462 2008 105 Trinity 114.0 NaN NaN NaN
428 2009 105 Trinity 114.0 NaN NaN NaN
In [20]:
data[data.Production.isna()].sort_values(['County','Year'])
Out[20]:
Year CountyCode County HarvestedAcres Yield Production Price
16 2020 57 Nevada 408.0 NaN NaN NaN
936 1993 81 SanMateo 48.0 NaN NaN NaN
907 1994 81 SanMateo 54.0 NaN NaN NaN
877 1995 81 SanMateo 56.0 NaN NaN NaN
848 1996 81 SanMateo 56.0 NaN NaN NaN
819 1997 81 SanMateo 52.0 NaN NaN NaN
786 1998 81 SanMateo 54.0 NaN NaN NaN
754 1999 81 SanMateo 40.0 NaN NaN NaN
723 2000 81 SanMateo 45.0 NaN NaN NaN
692 2001 81 SanMateo 44.0 NaN NaN NaN
660 2002 81 SanMateo 64.0 NaN NaN NaN
626 2003 81 SanMateo 60.0 NaN NaN NaN
592 2004 81 SanMateo NaN NaN NaN NaN
558 2005 81 SanMateo 88.0 NaN NaN NaN
524 2006 81 SanMateo 89.0 NaN NaN NaN
490 2007 81 SanMateo 98.0 NaN NaN NaN
456 2008 81 SanMateo 96.0 NaN NaN NaN
421 2009 81 SanMateo 135.0 NaN NaN NaN
386 2010 81 SanMateo 137.0 NaN NaN NaN
350 2011 81 SanMateo 135.0 NaN NaN NaN
314 2012 81 SanMateo 131.0 NaN NaN NaN
278 2013 81 SanMateo 153.0 NaN NaN NaN
243 2014 81 SanMateo 154.0 NaN NaN NaN
209 2015 81 SanMateo 165.0 NaN NaN NaN
173 2016 81 SanMateo 152.0 NaN NaN NaN
137 2017 81 SanMateo 164.0 NaN NaN NaN
100 2018 81 SanMateo 126.0 NaN NaN NaN
62 2019 81 SanMateo 181.0 NaN NaN NaN
25 2020 81 SanMateo 176.0 NaN NaN NaN
793 1998 105 Trinity 85.0 NaN NaN NaN
632 2003 105 Trinity 114.0 NaN NaN NaN
598 2004 105 Trinity 114.0 NaN NaN NaN
564 2005 105 Trinity 114.0 NaN NaN NaN
530 2006 105 Trinity 114.0 NaN NaN NaN
496 2007 105 Trinity 114.0 NaN NaN NaN
462 2008 105 Trinity 114.0 NaN NaN NaN
428 2009 105 Trinity 114.0 NaN NaN NaN
In [21]:
data[data.Price.isna()].sort_values(['County','Year'])
Out[21]:
Year CountyCode County HarvestedAcres Yield Production Price
16 2020 57 Nevada 408.0 NaN NaN NaN
936 1993 81 SanMateo 48.0 NaN NaN NaN
907 1994 81 SanMateo 54.0 NaN NaN NaN
877 1995 81 SanMateo 56.0 NaN NaN NaN
848 1996 81 SanMateo 56.0 NaN NaN NaN
819 1997 81 SanMateo 52.0 NaN NaN NaN
786 1998 81 SanMateo 54.0 NaN NaN NaN
754 1999 81 SanMateo 40.0 NaN NaN NaN
723 2000 81 SanMateo 45.0 NaN NaN NaN
692 2001 81 SanMateo 44.0 NaN NaN NaN
660 2002 81 SanMateo 64.0 NaN NaN NaN
626 2003 81 SanMateo 60.0 NaN NaN NaN
592 2004 81 SanMateo NaN NaN NaN NaN
558 2005 81 SanMateo 88.0 NaN NaN NaN
524 2006 81 SanMateo 89.0 NaN NaN NaN
490 2007 81 SanMateo 98.0 NaN NaN NaN
456 2008 81 SanMateo 96.0 NaN NaN NaN
421 2009 81 SanMateo 135.0 NaN NaN NaN
386 2010 81 SanMateo 137.0 NaN NaN NaN
350 2011 81 SanMateo 135.0 NaN NaN NaN
314 2012 81 SanMateo 131.0 NaN NaN NaN
278 2013 81 SanMateo 153.0 NaN NaN NaN
243 2014 81 SanMateo 154.0 NaN NaN NaN
209 2015 81 SanMateo 165.0 NaN NaN NaN
173 2016 81 SanMateo 152.0 NaN NaN NaN
137 2017 81 SanMateo 164.0 NaN NaN NaN
100 2018 81 SanMateo 126.0 NaN NaN NaN
62 2019 81 SanMateo 181.0 NaN NaN NaN
25 2020 81 SanMateo 176.0 NaN NaN NaN
793 1998 105 Trinity 85.0 NaN NaN NaN
632 2003 105 Trinity 114.0 NaN NaN NaN
598 2004 105 Trinity 114.0 NaN NaN NaN
564 2005 105 Trinity 114.0 NaN NaN NaN
530 2006 105 Trinity 114.0 NaN NaN NaN
496 2007 105 Trinity 114.0 NaN NaN NaN
462 2008 105 Trinity 114.0 NaN NaN NaN
428 2009 105 Trinity 114.0 NaN NaN NaN
In [22]:
data[data['County'] == 'Nevada']
Out[22]:
Year CountyCode County HarvestedAcres Yield Production Price
16 2020 57 Nevada 408.0 NaN NaN NaN
53 2019 57 Nevada 416.0 3.92 1630.0 1564.42
91 2018 57 Nevada 416.0 4.52 1880.0 1284.57
128 2017 57 Nevada 416.0 3.65 1520.0 1287.50
164 2016 57 Nevada 417.0 3.33 1390.0 1386.33
200 2015 57 Nevada 352.0 1.87 659.0 1349.01
234 2014 57 Nevada 342.0 3.86 1320.0 1341.67
269 2013 57 Nevada 424.0 2.92 1240.0 1521.77
305 2012 57 Nevada 312.0 2.75 858.0 1393.94
341 2011 57 Nevada 343.0 5.19 1780.0 1101.12
377 2010 57 Nevada 236.0 3.12 736.0 1402.17
412 2009 57 Nevada 248.0 10.44 2590.0 542.63
447 2008 57 Nevada 402.0 2.96 1189.0 1280.82
481 2007 57 Nevada 385.0 4.23 1629.0 1280.91
515 2006 57 Nevada 358.0 3.71 1329.0 1170.28
549 2005 57 Nevada 350.0 3.72 1302.0 1294.55
583 2004 57 Nevada 349.0 4.11 1434.0 1527.68
617 2003 57 Nevada 356.0 3.67 1307.0 1350.80
651 2002 57 Nevada 404.0 4.32 1746.0 1053.00
683 2001 57 Nevada 348.0 3.00 1043.0 1192.00
715 2000 57 Nevada 303.0 3.57 1082.0 1096.00
746 1999 57 Nevada 201.0 4.49 902.0 1031.00
778 1998 57 Nevada 201.0 2.72 546.0 1105.00
811 1997 57 Nevada 201.0 5.29 1063.0 1099.00
841 1996 57 Nevada 201.0 4.26 856.0 1068.00
869 1995 57 Nevada 201.0 2.46 495.0 1018.00
899 1994 57 Nevada 174.0 3.92 682.0 701.00
928 1993 57 Nevada 174.0 3.42 595.0 713.00
957 1992 57 Nevada 174.0 4.93 858.0 816.00
986 1991 57 Nevada 174.0 4.94 860.0 661.00
1015 1990 57 Nevada NaN NaN 527.0 928.00
1043 1989 57 Nevada NaN NaN 473.0 656.00
1072 1988 57 Nevada NaN NaN 223.0 502.00
1103 1987 57 Nevada NaN NaN 238.0 465.00
1134 1986 57 Nevada NaN NaN 125.0 435.00
1163 1985 57 Nevada NaN NaN 62.0 405.00
1193 1984 57 Nevada NaN NaN 46.0 496.00
1225 1983 57 Nevada NaN NaN 23.0 500.00
In [23]:
data[data['County'] == 'SanMateo']
Out[23]:
Year CountyCode County HarvestedAcres Yield Production Price
25 2020 81 SanMateo 176.0 NaN NaN NaN
62 2019 81 SanMateo 181.0 NaN NaN NaN
100 2018 81 SanMateo 126.0 NaN NaN NaN
137 2017 81 SanMateo 164.0 NaN NaN NaN
173 2016 81 SanMateo 152.0 NaN NaN NaN
209 2015 81 SanMateo 165.0 NaN NaN NaN
243 2014 81 SanMateo 154.0 NaN NaN NaN
278 2013 81 SanMateo 153.0 NaN NaN NaN
314 2012 81 SanMateo 131.0 NaN NaN NaN
350 2011 81 SanMateo 135.0 NaN NaN NaN
386 2010 81 SanMateo 137.0 NaN NaN NaN
421 2009 81 SanMateo 135.0 NaN NaN NaN
456 2008 81 SanMateo 96.0 NaN NaN NaN
490 2007 81 SanMateo 98.0 NaN NaN NaN
524 2006 81 SanMateo 89.0 NaN NaN NaN
558 2005 81 SanMateo 88.0 NaN NaN NaN
592 2004 81 SanMateo NaN NaN NaN NaN
626 2003 81 SanMateo 60.0 NaN NaN NaN
660 2002 81 SanMateo 64.0 NaN NaN NaN
692 2001 81 SanMateo 44.0 NaN NaN NaN
723 2000 81 SanMateo 45.0 NaN NaN NaN
754 1999 81 SanMateo 40.0 NaN NaN NaN
786 1998 81 SanMateo 54.0 NaN NaN NaN
819 1997 81 SanMateo 52.0 NaN NaN NaN
848 1996 81 SanMateo 56.0 NaN NaN NaN
877 1995 81 SanMateo 56.0 NaN NaN NaN
907 1994 81 SanMateo 54.0 NaN NaN NaN
936 1993 81 SanMateo 48.0 NaN NaN NaN
In [24]:
data[data['County'] == 'Trinity']
Out[24]:
Year CountyCode County HarvestedAcres Yield Production Price
71 2019 105 Trinity 44.0 1.93 85.0 1729.41
108 2018 105 Trinity 44.0 1.93 85.0 1729.41
144 2017 105 Trinity 44.0 1.93 85.0 1729.41
180 2016 105 Trinity 44.0 1.93 85.0 1729.41
285 2013 105 Trinity 114.0 2.33 266.0 1214.29
321 2012 105 Trinity 114.0 2.33 266.0 1214.29
357 2011 105 Trinity 114.0 2.33 266.0 1214.29
393 2010 105 Trinity 114.0 2.33 266.0 1214.29
428 2009 105 Trinity 114.0 NaN NaN NaN
462 2008 105 Trinity 114.0 NaN NaN NaN
496 2007 105 Trinity 114.0 NaN NaN NaN
530 2006 105 Trinity 114.0 NaN NaN NaN
564 2005 105 Trinity 114.0 NaN NaN NaN
598 2004 105 Trinity 114.0 NaN NaN NaN
632 2003 105 Trinity 114.0 NaN NaN NaN
666 2002 105 Trinity NaN NaN 140.0 1204.00
698 2001 105 Trinity NaN NaN 140.0 1204.00
728 2000 105 Trinity NaN NaN 140.0 1204.00
760 1999 105 Trinity NaN NaN 140.0 1204.00
793 1998 105 Trinity 85.0 NaN NaN NaN
In [25]:
data.dropna(inplace=True)
In [26]:
data.isnull().sum()
Out[26]:
Year              0
CountyCode        0
County            0
HarvestedAcres    0
Yield             0
Production        0
Price             0
dtype: int64
In [27]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266 entries, 0 to 1314
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1266 non-null   int64  
 1   CountyCode      1266 non-null   int64  
 2   County          1266 non-null   object 
 3   HarvestedAcres  1266 non-null   float64
 4   Yield           1266 non-null   float64
 5   Production      1266 non-null   float64
 6   Price           1266 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 79.1+ KB

Discretizing the data into classes.

In [28]:
data
Out[28]:
Year CountyCode County HarvestedAcres Yield Production Price
0 2020 1 Alameda 2530.0 5.14 13000.0 1497.69
1 2020 5 Amador 5360.0 2.31 12400.0 1318.31
2 2020 9 Calaveras 579.0 3.06 1770.0 1325.99
3 2020 11 Colusa 747.0 6.02 4500.0 684.67
4 2020 13 ContraCosta 1940.0 4.69 9090.0 751.27
... ... ... ... ... ... ... ...
1310 1980 95 Solano 1138.0 3.99 4544.0 315.00
1311 1980 97 Sonoma 23639.0 3.34 78941.0 506.00
1312 1980 99 Stanislaus 17950.0 8.80 157900.0 183.00
1313 1980 107 Tulare 15159.0 8.88 134600.0 170.00
1314 1980 113 Yolo 566.0 8.70 4924.0 274.00

1266 rows × 7 columns

In [29]:
price_data = data.Price
In [30]:
# len(price_data)
price_data.size
Out[30]:
1266
In [31]:
price_data.sort_values(ascending=True)
Out[31]:
1156       74.00
1155       90.00
1158       94.00
1188       95.00
1185       97.00
          ...   
370      5125.60
127      5287.55
90       5614.05
52       5862.26
522     36342.07
Name: Price, Length: 1266, dtype: float64
In [32]:
data.sort_values('Price')
Out[32]:
Year CountyCode County HarvestedAcres Yield Production Price
1156 1985 31 Kings 1117.0 11.90 13292.0 74.00
1155 1985 29 Kern 33255.0 8.48 282000.0 90.00
1158 1985 39 Madera 40100.0 7.54 302354.0 94.00
1188 1984 39 Madera 36010.0 6.87 247389.0 95.00
1185 1984 29 Kern 34861.0 6.95 242160.0 97.00
... ... ... ... ... ... ... ...
370 2010 41 Marin 186.0 1.11 207.0 5125.60
127 2017 55 Napa 43600.0 3.26 142000.0 5287.55
90 2018 55 Napa 43400.0 4.26 185000.0 5614.05
52 2019 55 Napa 44200.0 3.62 160000.0 5862.26
522 2006 77 SanJoaquin 92501.0 0.06 5610.0 36342.07

1266 rows × 7 columns

In [33]:
data['Price'].value_counts()
Out[33]:
400.00     9
193.00     5
249.00     5
170.00     5
1263.16    5
          ..
1046.00    1
1139.48    1
253.16     1
273.03     1
274.00     1
Name: Price, Length: 1057, dtype: int64
In [34]:
data['Price'].value_counts().sort_index()
Out[34]:
74.00       1
90.00       1
94.00       1
95.00       1
97.00       1
           ..
5125.60     1
5287.55     1
5614.05     1
5862.26     1
36342.07    1
Name: Price, Length: 1057, dtype: int64
In [35]:
plt.figure(figsize=(200, 6))
price_data.value_counts().plot(kind = 'bar', title = 'Counts')
Out[35]:
<AxesSubplot: title={'center': 'Counts'}>
In [36]:
data['Price_Classification'] = pd.cut(x=data['Price'], bins=[0, 250, 1000, 50000], labels=[0, 1, 2])
data['Price_Categories'] = pd.cut(x=data['Price'], bins=[0, 250, 1000, 50000], labels=["Low", "Medium", "High"])
In [37]:
data.Price_Categories.size
Out[37]:
1266
In [38]:
data.Price_Categories.value_counts()
Out[38]:
Medium    557
High      496
Low       213
Name: Price_Categories, dtype: int64
In [39]:
data.Price_Categories.value_counts().plot(kind='bar', title='Price Categories')
plt.xticks(rotation=0)
# plt.grid(True)
Out[39]:
(array([0, 1, 2]),
 [Text(0, 0, 'Medium'), Text(1, 0, 'High'), Text(2, 0, 'Low')])
In [40]:
data.Price_Categories.value_counts().plot(kind='pie', title='Price Categories')
Out[40]:
<AxesSubplot: title={'center': 'Price Categories'}, ylabel='Price_Categories'>
In [41]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266 entries, 0 to 1314
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Year                  1266 non-null   int64   
 1   CountyCode            1266 non-null   int64   
 2   County                1266 non-null   object  
 3   HarvestedAcres        1266 non-null   float64 
 4   Yield                 1266 non-null   float64 
 5   Production            1266 non-null   float64 
 6   Price                 1266 non-null   float64 
 7   Price_Classification  1266 non-null   category
 8   Price_Categories      1266 non-null   category
dtypes: category(2), float64(4), int64(2), object(1)
memory usage: 81.9+ KB

Year (From 1980 to 2020)
CountyCode
County
HarvestedAcres
Yield (Tons/Acres)
Production (Tons)
Price (Dollar/Ton)

$Formula: $
$Yield = \frac{Production}{HarvestedAcres}$

In [42]:
plt.figure(figsize=(12, 6))
data.boxplot()
Out[42]:
<AxesSubplot: >
In [43]:
plt.figure(figsize=(12, 6))
sns.boxplot(data)
Out[43]:
<AxesSubplot: >
In [44]:
data.plot(kind='box', subplots=True, layout=(3, 3), figsize=(20, 18))
Out[44]:
Year                 AxesSubplot(0.125,0.653529;0.227941x0.226471)
CountyCode        AxesSubplot(0.398529,0.653529;0.227941x0.226471)
HarvestedAcres    AxesSubplot(0.672059,0.653529;0.227941x0.226471)
Yield                AxesSubplot(0.125,0.381765;0.227941x0.226471)
Production        AxesSubplot(0.398529,0.381765;0.227941x0.226471)
Price             AxesSubplot(0.672059,0.381765;0.227941x0.226471)
dtype: object
In [45]:
sns.heatmap(data.corr(), annot=True)
plt.tight_layout()
In [46]:
sns.pairplot(data, hue='Price_Categories', palette='tab10')
Out[46]:
<seaborn.axisgrid.PairGrid at 0x1a7563c9c90>
In [47]:
year_group_df = data.groupby(['Year']).mean()
In [48]:
year_group_df
Out[48]:
CountyCode HarvestedAcres Yield Production Price
Year
1980 58.363636 11648.181818 5.423182 81257.045455 311.181818
1981 58.363636 11361.045455 5.122727 74517.454545 365.909091
1982 56.724138 10141.793103 5.392759 72930.310345 333.379310
1983 56.290323 10020.483871 4.612903 59887.290323 328.935484
1984 57.466667 10810.733333 4.643000 64995.700000 314.333333
1985 58.142857 11566.142857 5.326786 78133.321429 294.035714
1986 56.448276 11338.310345 5.053793 70793.103448 320.862069
1987 57.466667 10421.900000 4.572000 64398.266667 363.933333
1988 60.500000 11734.607143 5.097857 82406.928571 461.928571
1989 60.500000 12157.607143 5.382143 82915.714286 543.321429
1990 59.857143 11866.464286 5.058214 82378.000000 532.928571
1991 59.758621 11439.965517 5.310345 79712.275862 551.827586
1992 59.758621 14532.551724 5.518276 115578.896552 585.103448
1993 58.928571 13971.035714 5.612857 110075.071429 572.750000
1994 57.482759 13144.931034 5.321724 93701.448276 581.448276
1995 57.482759 14098.068966 5.145862 106062.379310 668.551724
1996 55.666667 15990.074074 5.302963 112124.962963 812.814815
1997 57.413793 16439.206897 6.615172 143187.896552 893.068966
1998 60.161290 15704.064516 5.067419 109923.645161 940.096774
1999 58.466667 17762.833333 4.806333 107824.433333 1035.866667
2000 59.500000 16633.214286 5.092143 132022.071429 1039.821429
2001 60.333333 16330.033333 4.642333 107666.200000 1023.766667
2002 58.375000 17533.781250 4.671875 121630.875000 977.625000
2003 58.375000 17039.093750 4.500312 107848.437500 972.457188
2004 58.375000 18094.218750 4.444375 113704.875000 985.163750
2005 58.375000 18063.125000 5.443750 139594.375000 996.616250
2006 58.375000 17290.531250 4.634062 92065.218750 2148.246562
2007 58.375000 17021.812500 4.664688 115152.156250 1156.425625
2008 58.375000 17666.000000 4.590625 121311.625000 1268.173125
2009 59.303030 16797.393939 5.448485 121407.212121 1118.793636
2010 60.371429 15781.371429 5.030857 113799.657143 1102.573429
2011 59.000000 15903.027778 5.267222 106007.888889 1135.948889
2012 57.400000 17577.571429 6.063143 130217.885714 1162.566857
2013 57.400000 15539.371429 5.696286 119421.685714 1229.459143
2014 56.151515 16944.030303 5.313030 117627.787879 1239.334848
2015 54.882353 16403.617647 4.743529 103503.529412 1284.156765
2016 56.314286 16588.028571 5.207714 111014.857143 1380.673429
2017 56.314286 16300.485714 5.417714 110198.771429 1397.224000
2018 57.611111 15638.500000 5.603056 113497.027778 1447.444722
2019 58.729730 15292.594595 5.691351 105145.243243 1467.862703
2020 57.882353 16282.823529 5.570294 101469.411765 1396.155294
In [49]:
year_group_df.index
Out[49]:
Int64Index([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
            1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
            2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
            2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
           dtype='int64', name='Year')

Exploratory Data Analysis:¶

In [50]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=data.groupby(['Year']).count().index, y=data.groupby(['Year']).count()['County'])
plt.ylabel(ylabel='Number of counties producing wine')
plt.title(label='Fluctuation in the number of wine-producing counties across time')
plt.show()
In [51]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Production'])
plt.title(label='Fluctuations in Production over Time')
plt.show()
In [52]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Production')
plt.title(label='Fluctuations in Production over Time')
plt.show()
In [53]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Production')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Production in Tons')
plt.show()
In [54]:
data.sort_values('Production').tail(5)
Out[54]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
293 2012 19 Fresno 86700.0 10.38 900000.0 385.48 1 Medium
329 2011 19 Fresno 68900.0 13.13 905000.0 335.50 1 Medium
640 2002 19 Fresno 85701.0 10.83 928100.0 141.00 0 Low
801 1997 19 Fresno 87220.0 11.22 978930.0 225.00 0 Low
436 2008 19 Fresno 69631.0 14.94 1040100.0 250.93 1 Medium
In [55]:
sns.barplot(x='Year', y='Production', data=data.sort_values('Production').tail(5))
plt.title(label='Top Five Wine Production Years')
Out[55]:
Text(0.5, 1.0, 'Top Five Wine Production Years')
In [56]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['HarvestedAcres'])
plt.title(label='Fluctuations in Harvested Acres over Time')
plt.show()
In [57]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='HarvestedAcres')
plt.title(label='Fluctuations in Harvested Acres over Time')
plt.show()
In [58]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Harvested Area in Acres')
plt.show()
In [59]:
data.sort_values('HarvestedAcres').tail(5)
Out[59]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
171 2016 77 SanJoaquin 98000.0 7.31 716000.0 594.67 1 Medium
135 2017 77 SanJoaquin 98100.0 6.79 666000.0 593.91 1 Medium
241 2014 77 SanJoaquin 102000.0 8.00 816000.0 590.00 1 Medium
312 2012 77 SanJoaquin 109000.0 8.18 892000.0 605.72 1 Medium
736 1999 19 Fresno 128613.0 5.40 693910.0 237.00 0 Low
In [60]:
sns.barplot(x='Year', y='HarvestedAcres', data=data.sort_values('HarvestedAcres').tail(5))
plt.title(label='Top Five Grapes Harvest Years')
Out[60]:
Text(0.5, 1.0, 'Top Five Grapes Harvest Years')
In [61]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Yield'])
plt.title(label='Fluctuations in Yield over Time')
plt.show()
In [62]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Yield')
plt.title(label='Fluctuations in Yield over Time')
plt.show()
In [63]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Annual Grapes Yield in Tons per Acres')
plt.show()
In [64]:
data.sort_values('Yield').tail(5)
Out[64]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
145 2017 107 Tulare 9450.0 17.35 164000.0 308.00 1 Medium
109 2018 107 Tulare 8410.0 17.72 149000.0 314.00 1 Medium
72 2019 107 Tulare 6260.0 18.69 117000.0 290.00 1 Medium
338 2011 51 Mono 4.0 23.50 94.0 670.21 1 Medium
302 2012 51 Mono 3.0 25.00 75.0 653.33 1 Medium
In [65]:
sns.barplot(x='Year', y='Yield', data=data.sort_values('Yield').tail(5))
plt.title(label='Top Five Wine Yield Years')
Out[65]:
Text(0.5, 1.0, 'Top Five Wine Yield Years')
In [66]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=year_group_df.index, y=year_group_df['Price'])
plt.title(label='Fluctuations in Price over Time')
plt.show()
In [67]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='Year', y='Price')
plt.title(label='Fluctuations in Price over Time')
plt.show()
In [68]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='Year', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price per Ton over Time')
plt.show()
In [69]:
plt.figure(figsize=(12, 6))
sns.countplot(x='Year', hue='Price_Categories', data=data)
plt.xticks(rotation=90)
plt.title(label='Price Categories over Time')
plt.show()
In [70]:
import plotly.express as px
fig = px.line(data, x='Year', y='Price', color='Price_Categories', symbol="Price_Categories")
fig.show()
In [71]:
data.sort_values('Price').tail(5)
Out[71]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
370 2010 41 Marin 186.0 1.11 207.0 5125.60 2 High
127 2017 55 Napa 43600.0 3.26 142000.0 5287.55 2 High
90 2018 55 Napa 43400.0 4.26 185000.0 5614.05 2 High
52 2019 55 Napa 44200.0 3.62 160000.0 5862.26 2 High
522 2006 77 SanJoaquin 92501.0 0.06 5610.0 36342.07 2 High
In [72]:
sns.barplot(x='Year', y='Price', data=data.sort_values('Price').tail(5))
plt.title(label='Top Five Wine Price Years')
Out[72]:
Text(0.5, 1.0, 'Top Five Wine Price Years')
In [73]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Production')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Production over County')
plt.show()
In [74]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Production')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Production over County')
plt.show()
In [75]:
data.sort_values('Production').tail(1)
Out[75]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
436 2008 19 Fresno 69631.0 14.94 1040100.0 250.93 1 Medium
In [76]:
sns.barplot(x='County', y='Production', data=data.sort_values('Production').tail(1))
plt.title(label='Top Wine Production County')
Out[76]:
Text(0.5, 1.0, 'Top Wine Production County')
In [77]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in HarvestedAcres over County')
plt.show()
In [78]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='HarvestedAcres')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in HarvestedAcres over County')
plt.show()
In [79]:
data.sort_values('HarvestedAcres').tail(1)
Out[79]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
736 1999 19 Fresno 128613.0 5.4 693910.0 237.0 0 Low
In [80]:
sns.barplot(x='County', y='HarvestedAcres', data=data.sort_values('HarvestedAcres').tail(1))
plt.title(label='Top Grapes Harvest County')
Out[80]:
Text(0.5, 1.0, 'Top Grapes Harvest County')
In [81]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Yield over County')
plt.show()
In [82]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Yield')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Yield over County')
plt.show()
In [83]:
data.sort_values('Yield').tail(1)
Out[83]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
302 2012 51 Mono 3.0 25.0 75.0 653.33 1 Medium
In [84]:
sns.barplot(x='County', y='Yield', data=data.sort_values('Yield').tail(1))
plt.title(label='Top Wine Yield County')
Out[84]:
Text(0.5, 1.0, 'Top Wine Yield County')
In [85]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x='County', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price over County')
plt.show()
In [86]:
plt.figure(figsize=(12, 6))
sns.color_palette("tab10")
sns.barplot(data, x='County', y='Price')
plt.xticks(rotation=90)
plt.title(label='Fluctuations in Price over County')
plt.show()
In [87]:
plt.figure(figsize=(12, 6))
sns.countplot(x='County', hue='Price_Categories', data=data)
plt.xticks(rotation=90)
plt.title(label='Price Categories over County')
plt.show()
In [88]:
data.sort_values('Price').tail(1)
Out[88]:
Year CountyCode County HarvestedAcres Yield Production Price Price_Classification Price_Categories
522 2006 77 SanJoaquin 92501.0 0.06 5610.0 36342.07 2 High
In [89]:
sns.barplot(x='County', y='Price', data=data.sort_values('Price').tail(1))
plt.title(label='Top Wine Price County')
Out[89]:
Text(0.5, 1.0, 'Top Wine Price County')
In [90]:
fig = category_scatter(x='Yield', y='Production', label_col='Price_Categories', data=data, legend_loc='upper left')
plt.xlabel('Yield')
plt.ylabel('Production')
plt.title(label='Price Categories over Yield and Production')
# plt.grid(True)
plt.show()
In [91]:
plt.figure(figsize=(12, 6))
sns.scatterplot(data, x='Yield', y='Production', hue="Price_Categories")
plt.title(label='Price Categories over Yield and Production')
Out[91]:
Text(0.5, 1.0, 'Price Categories over Yield and Production')
In [92]:
sns.scatterplot(data, x='CountyCode', y='Year', hue="Price_Categories")
plt.title(label='Price Categories over CountyCode and Year')
Out[92]:
Text(0.5, 1.0, 'Price Categories over CountyCode and Year')
In [93]:
plt.figure(figsize=(12, 6))
sns.lineplot(data, x="CountyCode", y="Year", hue="Price_Categories")
plt.title(label='Price Categories over CountyCode and Year')
Out[93]:
Text(0.5, 1.0, 'Price Categories over CountyCode and Year')

Setting up the target variable.

In [94]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266 entries, 0 to 1314
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Year                  1266 non-null   int64   
 1   CountyCode            1266 non-null   int64   
 2   County                1266 non-null   object  
 3   HarvestedAcres        1266 non-null   float64 
 4   Yield                 1266 non-null   float64 
 5   Production            1266 non-null   float64 
 6   Price                 1266 non-null   float64 
 7   Price_Classification  1266 non-null   category
 8   Price_Categories      1266 non-null   category
dtypes: category(2), float64(4), int64(2), object(1)
memory usage: 81.9+ KB
In [95]:
features = ['Year', 'CountyCode', 'HarvestedAcres', 'Yield', 'Production']
x = data[features]
y = data.Price_Classification
In [96]:
x
Out[96]:
Year CountyCode HarvestedAcres Yield Production
0 2020 1 2530.0 5.14 13000.0
1 2020 5 5360.0 2.31 12400.0
2 2020 9 579.0 3.06 1770.0
3 2020 11 747.0 6.02 4500.0
4 2020 13 1940.0 4.69 9090.0
... ... ... ... ... ...
1310 1980 95 1138.0 3.99 4544.0
1311 1980 97 23639.0 3.34 78941.0
1312 1980 99 17950.0 8.80 157900.0
1313 1980 107 15159.0 8.88 134600.0
1314 1980 113 566.0 8.70 4924.0

1266 rows × 5 columns

In [97]:
x.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266 entries, 0 to 1314
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1266 non-null   int64  
 1   CountyCode      1266 non-null   int64  
 2   HarvestedAcres  1266 non-null   float64
 3   Yield           1266 non-null   float64
 4   Production      1266 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 59.3 KB
In [98]:
y = y.astype('int64')
y.info()
<class 'pandas.core.series.Series'>
Int64Index: 1266 entries, 0 to 1314
Series name: Price_Classification
Non-Null Count  Dtype
--------------  -----
1266 non-null   int64
dtypes: int64(1)
memory usage: 19.8 KB
In [99]:
y.value_counts()
Out[99]:
1    557
2    496
0    213
Name: Price_Classification, dtype: int64
In [100]:
y.value_counts().plot(kind = 'bar', title = 'Price Classification Counts')
plt.xticks(rotation=0)
Out[100]:
(array([0, 1, 2]), [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '0')])
In [101]:
sns.heatmap(x.corr(), annot=True)
plt.tight_layout()
In [102]:
scaler = MinMaxScaler()
# scaler.fit(x)
# scaler.transform(x)
x = scaler.fit_transform(x)
In [103]:
x
Out[103]:
array([[1.        , 0.        , 0.01964855, 0.20368885, 0.01246272],
       [1.        , 0.03508772, 0.04165306, 0.09021652, 0.01188583],
       [1.        , 0.07017544, 0.00447866, 0.12028869, 0.00166529],
       ...,
       [0.        , 0.85964912, 0.13954591, 0.35044106, 0.15178134],
       [0.        , 0.92982456, 0.11784465, 0.35364876, 0.12937883],
       [0.        , 0.98245614, 0.00437758, 0.34643144, 0.0046978 ]])
In [104]:
sns.boxplot(x)
Out[104]:
<AxesSubplot: >

Split dataset into 80% for training and 20% for testing.

In [105]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)
In [106]:
print("Shape of original dataset:", data.shape)
print("Shape of input training set:", x_train.shape)
print("Shape of output training set:", y_train.shape)
print("Shape of input testing set:", x_test.shape)
print("Shape of output testing set:", y_test.shape)
Shape of original dataset: (1266, 9)
Shape of input training set: (1012, 5)
Shape of output training set: (1012,)
Shape of input testing set: (254, 5)
Shape of output testing set: (254,)
In [107]:
y_test.size
Out[107]:
254
In [108]:
y_test.value_counts()
Out[108]:
2    110
1    109
0     35
Name: Price_Classification, dtype: int64

Performance Functions for Models

In [109]:
true_class_names = ["True Low", "True Medium", "True High"]
predicted_class_names = ["Predicted Low", "Predicted Medium", "Predicted High"]
In [110]:
def Confusion_Matrix_Plotter(cm, dtype):
    cm_df = pd.DataFrame(cm, index = true_class_names, columns = predicted_class_names)
    if dtype == 1:
        sns.heatmap(cm_df, annot=True, fmt="d")
        plt.title('Confusion Matrix')
    else:
        sns.heatmap(cm_df, annot=True)
        plt.title('Confusion Matrix Percentage')
    plt.tight_layout()
In [111]:
def Compute_Error(cm):
    n11, n12, n13, n21, n22, n23, n31, n32, n33 = cm.ravel()
    TP_C1 = n11
    TN_C1 = n22 + n33
    FP_C1 = n21 + n31
    FN_C1 = n12 + n13
    Type1_Error_C1 = FP_C1
    Type2_Error_C1 = FN_C1
    print("Type1_Error_LowPrice:", Type1_Error_C1)
    print("Type2_Error_LowPrice:", Type2_Error_C1)
    TP_C2 = n22
    TN_C2 = n11 + n33
    FP_C2 = n12 + n32
    FN_C2 = n21 + n23
    Type1_Error_C2 = FP_C2
    Type2_Error_C2 = FN_C2
    print("Type1_Error_MediumPrice:", Type1_Error_C2)
    print("Type2_Error_MediumPrice:", Type2_Error_C2)
    TP_C3 = n33
    TN_C3 = n11 + n22
    FP_C3 = n13 + n23
    FN_C3 = n31 + n32
    Type1_Error_C3 = FP_C3
    Type2_Error_C3 = FN_C3
    print("Type1_Error_HighPrice:", Type1_Error_C3)
    print("Type2_Error_HighPrice:", Type2_Error_C3)
    return Type1_Error_C1, Type2_Error_C1, Type1_Error_C2, Type2_Error_C2, Type1_Error_C3, Type2_Error_C3
In [112]:
def Compute_Sensitivity(TP, FN):
    sensitivity_test = (TP / (TP + FN))
    return sensitivity_test
In [113]:
def Compute_Specificity(TN, FP):
    specificity_test = (TN / (FP + TN))
    return specificity_test

Naive Bayes¶

In [114]:
nb = GaussianNB()
nb.fit(x_train, y_train)
Out[114]:
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
In [115]:
# Model Scores on training and test set
print("Training Set score:", nb.score(x_train, y_train))
print("Test Set score:", nb.score(x_test, y_test))
Training Set score: 0.708498023715415
Test Set score: 0.6929133858267716
In [116]:
# Prediction on Testing Data
y_pred_nb = nb.predict(x_test)
nb_accuracy = metrics.accuracy_score(y_test, y_pred_nb)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_nb))
Accuracy: 0.6929133858267716
In [117]:
# Prediction on Training Data
y_pred2_nb = nb.predict(x_train)
nb_taccuracy = metrics.accuracy_score(y_train, y_pred2_nb)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_nb))
Accuracy: 0.708498023715415
In [118]:
confusion_matrix_nb = metrics.confusion_matrix(y_test, y_pred_nb)
confusion_matrix_nb
Out[118]:
array([[18, 15,  2],
       [12, 64, 33],
       [ 0, 16, 94]], dtype=int64)
In [119]:
Confusion_Matrix_Plotter(confusion_matrix_nb, 1)
In [120]:
confusion_matrix_nb_percent = confusion_matrix_nb.astype('float') / confusion_matrix_nb.sum(axis=1)[:, np.newaxis]
confusion_matrix_nb_percent
Out[120]:
array([[0.51428571, 0.42857143, 0.05714286],
       [0.11009174, 0.58715596, 0.30275229],
       [0.        , 0.14545455, 0.85454545]])
In [121]:
Confusion_Matrix_Plotter(confusion_matrix_nb_percent, 0)
In [122]:
print(classification_report(y_test, y_pred_nb, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.60      0.51      0.55        35
Medium Price       0.67      0.59      0.63       109
  High Price       0.73      0.85      0.79       110

    accuracy                           0.69       254
   macro avg       0.67      0.65      0.66       254
weighted avg       0.69      0.69      0.69       254

In [123]:
nb_t1_l, nb_t2_l, nb_t1_m, nb_t2_m, nb_t1_h, nb_t2_h = Compute_Error(confusion_matrix_nb)
Type1_Error_LowPrice: 12
Type2_Error_LowPrice: 17
Type1_Error_MediumPrice: 31
Type2_Error_MediumPrice: 45
Type1_Error_HighPrice: 35
Type2_Error_HighPrice: 16
In [124]:
nb_pl, nb_pm, nb_ph = precision_score(y_test, y_pred_nb, average=None)
In [125]:
nb_rl, nb_rm, nb_rh = recall_score(y_test, y_pred_nb, average=None)
In [126]:
nb_fl, nb_fm, nb_fh = f1_score(y_test, y_pred_nb, average=None)
In [127]:
cv_nb = cross_val_score(nb, x_train, y_train, cv = 10, scoring='accuracy')
cv_nb
Out[127]:
array([0.68627451, 0.65686275, 0.69306931, 0.79207921, 0.68316832,
       0.62376238, 0.78217822, 0.73267327, 0.7029703 , 0.67326733])
In [128]:
cv_nb_m = cv_nb.mean()
print("Cross Validation Score:", cv_nb_m)
Cross Validation Score: 0.7026305571733643
In [129]:
plot_learning_curves(x_train, y_train, x_test, y_test, nb)
plt.show()

Support Vector Classification¶

In [130]:
svc = SVC()
svc.fit(x_train, y_train)
Out[130]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [131]:
# Model Scores on training and test set
print("Training Set score:", svc.score(x_train, y_train))
print("Test Set score:", svc.score(x_test, y_test))
Training Set score: 0.8320158102766798
Test Set score: 0.7992125984251969
In [132]:
# Prediction on Testing Data
y_pred_svc = svc.predict(x_test)
svc_accuracy = metrics.accuracy_score(y_test, y_pred_svc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svc))
Accuracy: 0.7992125984251969
In [133]:
# Prediction on Training Data
y_pred2_svc = svc.predict(x_train)
svc_taccuracy = metrics.accuracy_score(y_train, y_pred2_svc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_svc))
Accuracy: 0.8320158102766798
In [134]:
confusion_matrix_svc = metrics.confusion_matrix(y_test, y_pred_svc)
confusion_matrix_svc
Out[134]:
array([[24, 10,  1],
       [ 8, 84, 17],
       [ 0, 15, 95]], dtype=int64)
In [135]:
Confusion_Matrix_Plotter(confusion_matrix_svc, 1)
In [136]:
confusion_matrix_svc_percent = confusion_matrix_svc.astype('float') / confusion_matrix_svc.sum(axis=1)[:, np.newaxis]
confusion_matrix_svc_percent
Out[136]:
array([[0.68571429, 0.28571429, 0.02857143],
       [0.0733945 , 0.7706422 , 0.1559633 ],
       [0.        , 0.13636364, 0.86363636]])
In [137]:
Confusion_Matrix_Plotter(confusion_matrix_svc_percent, 0)
In [138]:
print(classification_report(y_test, y_pred_svc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.75      0.69      0.72        35
Medium Price       0.77      0.77      0.77       109
  High Price       0.84      0.86      0.85       110

    accuracy                           0.80       254
   macro avg       0.79      0.77      0.78       254
weighted avg       0.80      0.80      0.80       254

In [139]:
svc_t1_l, svc_t2_l, svc_t1_m, svc_t2_m, svc_t1_h, svc_t2_h = Compute_Error(confusion_matrix_svc)
Type1_Error_LowPrice: 8
Type2_Error_LowPrice: 11
Type1_Error_MediumPrice: 25
Type2_Error_MediumPrice: 25
Type1_Error_HighPrice: 18
Type2_Error_HighPrice: 15
In [140]:
svc_pl, svc_pm, svc_ph = precision_score(y_test, y_pred_svc, average=None)

svc_rl, svc_rm, svc_rh = recall_score(y_test, y_pred_svc, average=None)

svc_fl, svc_fm, svc_fh = f1_score(y_test, y_pred_svc, average=None)
In [141]:
cv_svc = cross_val_score(svc, x_train, y_train, cv = 10, scoring='accuracy')
cv_svc
Out[141]:
array([0.83333333, 0.78431373, 0.77227723, 0.88118812, 0.79207921,
       0.81188119, 0.84158416, 0.87128713, 0.79207921, 0.83168317])
In [142]:
cv_svc_m = cv_svc.mean()
print("Cross Validation Score:", cv_svc_m)
Cross Validation Score: 0.8211706464764124
In [143]:
plot_learning_curves(x_train, y_train, x_test, y_test, svc)
plt.show()

Applying GridSearchCV.

In [144]:
svc.get_params()
Out[144]:
{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
In [145]:
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear'], 
              'degree': [1, 2, 3, 4, 5], 'gamma': [1, 0.1, 0.01, 0.001]}
In [146]:
param_grid = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'sigmoid'], 
              'degree': [1, 2, 3], 'gamma': [1, 0.1, 0.01]}
In [147]:
svc_gscv = GridSearchCV(SVC(), param_grid, scoring = 'accuracy', cv = 10, refit=True, verbose=1)
In [148]:
svc_gscv.fit(x_train, y_train)
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Out[148]:
GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'degree': [1, 2, 3],
                         'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf', 'sigmoid']},
             scoring='accuracy', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'degree': [1, 2, 3],
                         'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf', 'sigmoid']},
             scoring='accuracy', verbose=1)
SVC()
SVC()
In [149]:
# Model Scores on training and test set
print("Training Set score:", svc_gscv.score(x_train, y_train))
print("Test Set score:", svc_gscv.score(x_test, y_test))
Training Set score: 0.83399209486166
Test Set score: 0.8110236220472441
In [150]:
#printing best parameter after tuning
print("GridSearch CV Best Parameters:", svc_gscv.best_params_) 

#printing how our model looks after hyper-parameter tuning
print("\nGridSearch CV Best Estimator:", svc_gscv.best_estimator_)

# best score achieved during the GridSearchCV
print("\nGridSearch CV Best score:", svc_gscv.best_score_)

cv_svc_gscv_b = svc_gscv.best_score_
GridSearch CV Best Parameters: {'C': 10, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}

GridSearch CV Best Estimator: SVC(C=10, degree=1, gamma=1)

GridSearch CV Best score: 0.8270821199767037
In [151]:
# Prediction on Testing Data
y_pred_svc = svc_gscv.predict(x_test)
svc_gscv_accuracy = metrics.accuracy_score(y_test, y_pred_svc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svc))
Accuracy: 0.8110236220472441
In [152]:
# Prediction on Training Data
y_pred2_svc = svc_gscv.predict(x_train)
svc_gscv_taccuracy = metrics.accuracy_score(y_train, y_pred2_svc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_svc))
Accuracy: 0.83399209486166
In [153]:
confusion_matrix_svc = metrics.confusion_matrix(y_test, y_pred_svc)
confusion_matrix_svc
Out[153]:
array([[25,  9,  1],
       [ 8, 84, 17],
       [ 0, 13, 97]], dtype=int64)
In [154]:
Confusion_Matrix_Plotter(confusion_matrix_svc, 1)
In [155]:
confusion_matrix_svc_percent = confusion_matrix_svc.astype('float') / confusion_matrix_svc.sum(axis=1)[:, np.newaxis]
confusion_matrix_svc_percent
Out[155]:
array([[0.71428571, 0.25714286, 0.02857143],
       [0.0733945 , 0.7706422 , 0.1559633 ],
       [0.        , 0.11818182, 0.88181818]])
In [156]:
Confusion_Matrix_Plotter(confusion_matrix_svc_percent, 0)
In [157]:
print(classification_report(y_test, y_pred_svc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.76      0.71      0.74        35
Medium Price       0.79      0.77      0.78       109
  High Price       0.84      0.88      0.86       110

    accuracy                           0.81       254
   macro avg       0.80      0.79      0.79       254
weighted avg       0.81      0.81      0.81       254

In [158]:
svc_gs_t1_l, svc_gs_t2_l, svc_gs_t1_m, svc_gs_t2_m, svc_gs_t1_h, svc_gs_t2_h = Compute_Error(confusion_matrix_svc)
Type1_Error_LowPrice: 8
Type2_Error_LowPrice: 10
Type1_Error_MediumPrice: 22
Type2_Error_MediumPrice: 25
Type1_Error_HighPrice: 18
Type2_Error_HighPrice: 13
In [159]:
svc_gs_pl, svc_gs_pm, svc_gs_ph = precision_score(y_test, y_pred_svc, average=None)

svc_gs_rl, svc_gs_rm, svc_gs_rh = recall_score(y_test, y_pred_svc, average=None)

svc_gs_fl, svc_gs_fm, svc_gs_fh = f1_score(y_test, y_pred_svc, average=None)

Logistic Regression¶

In [160]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
Out[160]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [161]:
# Model Scores on training and test set
print("Training Set score:", logreg.score(x_train, y_train))
print("Test Set score:", logreg.score(x_test, y_test))
Training Set score: 0.7529644268774703
Test Set score: 0.7283464566929134
In [162]:
# Prediction on Testing Data
y_pred_lr = logreg.predict(x_test)
lr_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_lr))
Accuracy: 0.7283464566929134
In [163]:
# Prediction on Training Data
y_pred2_lr = logreg.predict(x_train)
lr_taccuracy = metrics.accuracy_score(y_train, y_pred2_lr)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_lr))
Accuracy: 0.7529644268774703
In [164]:
confusion_matrix_lr = metrics.confusion_matrix(y_test, y_pred_lr)
confusion_matrix_lr
Out[164]:
array([[16, 18,  1],
       [ 3, 87, 19],
       [ 0, 28, 82]], dtype=int64)
In [165]:
Confusion_Matrix_Plotter(confusion_matrix_lr, 1)
In [166]:
confusion_matrix_lr_percent = confusion_matrix_lr.astype('float') / confusion_matrix_lr.sum(axis=1)[:, np.newaxis]
confusion_matrix_lr_percent
Out[166]:
array([[0.45714286, 0.51428571, 0.02857143],
       [0.02752294, 0.79816514, 0.17431193],
       [0.        , 0.25454545, 0.74545455]])
In [167]:
Confusion_Matrix_Plotter(confusion_matrix_lr_percent, 0)
In [168]:
print(classification_report(y_test, y_pred_lr, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.84      0.46      0.59        35
Medium Price       0.65      0.80      0.72       109
  High Price       0.80      0.75      0.77       110

    accuracy                           0.73       254
   macro avg       0.77      0.67      0.70       254
weighted avg       0.74      0.73      0.73       254

In [169]:
lr_t1_l, lr_t2_l, lr_t1_m, lr_t2_m, lr_t1_h, lr_t2_h = Compute_Error(confusion_matrix_lr)
Type1_Error_LowPrice: 3
Type2_Error_LowPrice: 19
Type1_Error_MediumPrice: 46
Type2_Error_MediumPrice: 22
Type1_Error_HighPrice: 20
Type2_Error_HighPrice: 28
In [170]:
lr_pl, lr_pm, lr_ph = precision_score(y_test, y_pred_lr, average=None)

lr_rl, lr_rm, lr_rh = recall_score(y_test, y_pred_lr, average=None)

lr_fl, lr_fm, lr_fh = f1_score(y_test, y_pred_lr, average=None)
In [171]:
cv_lr = cross_val_score(logreg, x_train, y_train, cv = 10, scoring='accuracy')
cv_lr
Out[171]:
array([0.80392157, 0.70588235, 0.7029703 , 0.79207921, 0.72277228,
       0.69306931, 0.75247525, 0.72277228, 0.76237624, 0.76237624])
In [172]:
cv_lr_m = cv_lr.mean()
print("Cross Validation Score:", cv_lr_m)
Cross Validation Score: 0.7420695010677537
In [173]:
plot_learning_curves(x_train, y_train, x_test, y_test, logreg)
plt.show()

Applying GridSearchCV.

In [174]:
logreg.get_params()
Out[174]:
{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}
In [175]:
parameters = {'penalty': ['l1', 'l2'], 'C': np.logspace(-3, 3, 7), 'solver': ['newton-cg', 'lbfgs', 'liblinear']}
In [176]:
lr_gs = GridSearchCV(estimator = logreg, param_grid = parameters, scoring = 'accuracy', cv = 10, verbose=1)
In [177]:
lr_gs.fit(x_train, y_train)
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Out[177]:
GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='accuracy', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='accuracy', verbose=1)
LogisticRegression()
LogisticRegression()
In [178]:
print("GridSearch CV Best Parameters:", lr_gs.best_params_) 

print("\nGridSearch CV Best Estimator:", lr_gs.best_estimator_)

print("\nGridSearch CV Best score:", lr_gs.best_score_)

cv_lr_gs_b = lr_gs.best_score_
GridSearch CV Best Parameters: {'C': 1000.0, 'penalty': 'l2', 'solver': 'newton-cg'}

GridSearch CV Best Estimator: LogisticRegression(C=1000.0, solver='newton-cg')

GridSearch CV Best score: 0.7865657153950689
In [179]:
# Prediction on Testing Data
y_pred_lr = lr_gs.predict(x_test)
lr_gs_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_lr))
Accuracy: 0.7559055118110236
In [180]:
# Prediction on Training Data
y_pred2_lr = lr_gs.predict(x_train)
lr_gs_taccuracy = metrics.accuracy_score(y_train, y_pred2_lr)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_lr))
Accuracy: 0.791501976284585
In [181]:
confusion_matrix_lr = metrics.confusion_matrix(y_test, y_pred_lr)
confusion_matrix_lr
Out[181]:
array([[22, 12,  1],
       [ 6, 86, 17],
       [ 0, 26, 84]], dtype=int64)
In [182]:
Confusion_Matrix_Plotter(confusion_matrix_lr, 1)
In [183]:
confusion_matrix_lr_percent = confusion_matrix_lr.astype('float') / confusion_matrix_lr.sum(axis=1)[:, np.newaxis]
confusion_matrix_lr_percent
Out[183]:
array([[0.62857143, 0.34285714, 0.02857143],
       [0.05504587, 0.78899083, 0.1559633 ],
       [0.        , 0.23636364, 0.76363636]])
In [184]:
Confusion_Matrix_Plotter(confusion_matrix_lr_percent, 0)
In [185]:
print(classification_report(y_test, y_pred_lr, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.79      0.63      0.70        35
Medium Price       0.69      0.79      0.74       109
  High Price       0.82      0.76      0.79       110

    accuracy                           0.76       254
   macro avg       0.77      0.73      0.74       254
weighted avg       0.76      0.76      0.76       254

In [186]:
lr_gs_t1_l, lr_gs_t2_l, lr_gs_t1_m, lr_gs_t2_m, lr_gs_t1_h, lr_gs_t2_h = Compute_Error(confusion_matrix_lr)
Type1_Error_LowPrice: 6
Type2_Error_LowPrice: 13
Type1_Error_MediumPrice: 38
Type2_Error_MediumPrice: 23
Type1_Error_HighPrice: 18
Type2_Error_HighPrice: 26
In [187]:
lr_gs_pl, lr_gs_pm, lr_gs_ph = precision_score(y_test, y_pred_lr, average=None)

lr_gs_rl, lr_gs_rm, lr_gs_rh = recall_score(y_test, y_pred_lr, average=None)

lr_gs_fl, lr_gs_fm, lr_gs_fh = f1_score(y_test, y_pred_lr, average=None)
In [188]:
plot_learning_curves(x_train, y_train, x_test, y_test, lr_gs)
plt.show()
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits
Fitting 10 folds for each of 42 candidates, totalling 420 fits

AdaBoost Classifier¶

In [189]:
abc = AdaBoostClassifier()
abc.fit(x_train, y_train)
Out[189]:
AdaBoostClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AdaBoostClassifier()
In [190]:
# Model Scores on training and test set
print("Training Set score:", abc.score(x_train, y_train))
print("Test Set score:", abc.score(x_test, y_test))
Training Set score: 0.7845849802371542
Test Set score: 0.7716535433070866
In [191]:
# Prediction on Testing Data
y_pred_abc = abc.predict(x_test)
abc_accuracy = metrics.accuracy_score(y_test, y_pred_abc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_abc))
Accuracy: 0.7716535433070866
In [192]:
# Prediction on Training Data
y_pred2_abc = abc.predict(x_train)
abc_taccuracy = metrics.accuracy_score(y_train, y_pred2_abc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_abc))
Accuracy: 0.7845849802371542
In [193]:
confusion_matrix_abc = metrics.confusion_matrix(y_test, y_pred_abc)
confusion_matrix_abc
Out[193]:
array([[21, 14,  0],
       [ 9, 88, 12],
       [ 0, 23, 87]], dtype=int64)
In [194]:
Confusion_Matrix_Plotter(confusion_matrix_abc, 1)
In [195]:
confusion_matrix_abc_percent = confusion_matrix_abc.astype('float') / confusion_matrix_abc.sum(axis=1)[:, np.newaxis]
confusion_matrix_abc_percent
Out[195]:
array([[0.6       , 0.4       , 0.        ],
       [0.08256881, 0.80733945, 0.11009174],
       [0.        , 0.20909091, 0.79090909]])
In [196]:
Confusion_Matrix_Plotter(confusion_matrix_abc_percent, 0)
In [197]:
print(classification_report(y_test, y_pred_abc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.70      0.60      0.65        35
Medium Price       0.70      0.81      0.75       109
  High Price       0.88      0.79      0.83       110

    accuracy                           0.77       254
   macro avg       0.76      0.73      0.74       254
weighted avg       0.78      0.77      0.77       254

In [198]:
abc_t1_l, abc_t2_l, abc_t1_m, abc_t2_m, abc_t1_h, abc_t2_h = Compute_Error(confusion_matrix_abc)
Type1_Error_LowPrice: 9
Type2_Error_LowPrice: 14
Type1_Error_MediumPrice: 37
Type2_Error_MediumPrice: 21
Type1_Error_HighPrice: 12
Type2_Error_HighPrice: 23
In [199]:
abc_pl, abc_pm, abc_ph = precision_score(y_test, y_pred_abc, average=None)

abc_rl, abc_rm, abc_rh = recall_score(y_test, y_pred_abc, average=None)

abc_fl, abc_fm, abc_fh = f1_score(y_test, y_pred_abc, average=None)
In [200]:
cv_abc = cross_val_score(abc, x_train, y_train, cv = 10, scoring='accuracy')
cv_abc
Out[200]:
array([0.73529412, 0.80392157, 0.73267327, 0.78217822, 0.78217822,
       0.76237624, 0.71287129, 0.75247525, 0.84158416, 0.84158416])
In [201]:
cv_abc_m = cv_abc.mean()
print("Cross Validation Score:", cv_abc_m)
Cross Validation Score: 0.7747136478353718
In [202]:
plot_learning_curves(x_train, y_train, x_test, y_test, abc)
plt.show()

K-Nearest Neighbours¶

In [203]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
Out[203]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()

Applying GridSearchCV.

In [204]:
knn.get_params()
Out[204]:
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}
In [205]:
k_range = list(range(1, 31))
In [206]:
grid_params = {'n_neighbors': k_range, 'weights': ['uniform', 'distance'], 
               'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev']}
In [207]:
knn_gscv = GridSearchCV(KNeighborsClassifier(), grid_params, cv=10)
In [208]:
knn_gscv.fit(x_train, y_train)
Out[208]:
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski',
                                    'chebyshev'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski',
                                    'chebyshev'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})
KNeighborsClassifier()
KNeighborsClassifier()
In [209]:
# Model Scores on training and test set
print("Training Set score:", knn_gscv.score(x_train, y_train))
print("Test Set score:", knn_gscv.score(x_test, y_test))
Training Set score: 1.0
Test Set score: 0.8779527559055118
In [210]:
print("GridSearch CV Best Parameters:", knn_gscv.best_params_) 

print("\nGridSearch CV Best Estimator:", knn_gscv.best_estimator_)

print("\nGridSearch CV Best score:", knn_gscv.best_score_)

cv_knn_b = knn_gscv.best_score_
GridSearch CV Best Parameters: {'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'distance'}

GridSearch CV Best Estimator: KNeighborsClassifier(metric='manhattan', n_neighbors=6, weights='distance')

GridSearch CV Best score: 0.8873519704911667
In [211]:
# Prediction on Testing Data
y_pred_knn = knn_gscv.predict(x_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_knn))
Accuracy: 0.8779527559055118
In [212]:
# Prediction on Training Data
y_pred2_knn = knn_gscv.predict(x_train)
knn_taccuracy = metrics.accuracy_score(y_train, y_pred2_knn)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_knn))
Accuracy: 1.0
In [213]:
confusion_matrix_knn = metrics.confusion_matrix(y_test, y_pred_knn)
confusion_matrix_knn
Out[213]:
array([[ 30,   5,   0],
       [ 10,  89,  10],
       [  0,   6, 104]], dtype=int64)
In [214]:
Confusion_Matrix_Plotter(confusion_matrix_knn, 1)
In [215]:
confusion_matrix_knn_percent = confusion_matrix_knn.astype('float') / confusion_matrix_knn.sum(axis=1)[:, np.newaxis]
confusion_matrix_knn_percent
Out[215]:
array([[0.85714286, 0.14285714, 0.        ],
       [0.09174312, 0.81651376, 0.09174312],
       [0.        , 0.05454545, 0.94545455]])
In [216]:
Confusion_Matrix_Plotter(confusion_matrix_knn_percent, 0)
In [217]:
print(classification_report(y_test, y_pred_knn, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.75      0.86      0.80        35
Medium Price       0.89      0.82      0.85       109
  High Price       0.91      0.95      0.93       110

    accuracy                           0.88       254
   macro avg       0.85      0.87      0.86       254
weighted avg       0.88      0.88      0.88       254

In [218]:
knn_t1_l, knn_t2_l, knn_t1_m, knn_t2_m, knn_t1_h, knn_t2_h = Compute_Error(confusion_matrix_knn)
Type1_Error_LowPrice: 10
Type2_Error_LowPrice: 5
Type1_Error_MediumPrice: 11
Type2_Error_MediumPrice: 20
Type1_Error_HighPrice: 10
Type2_Error_HighPrice: 6
In [219]:
knn_pl, knn_pm, knn_ph = precision_score(y_test, y_pred_knn, average=None)

knn_rl, knn_rm, knn_rh = recall_score(y_test, y_pred_knn, average=None)

knn_fl, knn_fm, knn_fh = f1_score(y_test, y_pred_knn, average=None)
In [220]:
plot_learning_curves(x_train, y_train, x_test, y_test, knn_gscv)
plt.show()

Decision Tree¶

In [221]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
Out[221]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [222]:
# Model Scores on training and test set
print("Training Set score:", dtc.score(x_train, y_train))
print("Test Set score:", dtc.score(x_test, y_test))
Training Set score: 1.0
Test Set score: 0.8385826771653543
In [223]:
def Plotter(df):
    plt.figure(figsize = (100, 35))
    print_tree = tree.plot_tree(df,
                       feature_names = features,
                       class_names = ['1','2','3'],
                       rounded = True,
                       filled = True)
    plt.show()
In [224]:
Plotter(dtc)
In [225]:
dtc_print_tree = tree.export_text(dtc, feature_names = features)
# print_tree
In [226]:
# Prediction on Testing Data
y_pred_dtc = dtc.predict(x_test)
dtc_accuracy = metrics.accuracy_score(y_test, y_pred_dtc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dtc))
Accuracy: 0.8385826771653543
In [227]:
# Prediction on Training Data
y_pred2_dtc = dtc.predict(x_train)
dtc_taccuracy = metrics.accuracy_score(y_train, y_pred2_dtc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_dtc))
Accuracy: 1.0
In [228]:
confusion_matrix_dtc = metrics.confusion_matrix(y_test, y_pred_dtc)
confusion_matrix_dtc
Out[228]:
array([[30,  5,  0],
       [ 6, 89, 14],
       [ 1, 15, 94]], dtype=int64)
In [229]:
Confusion_Matrix_Plotter(confusion_matrix_dtc, 1)
In [230]:
confusion_matrix_dtc_percent = confusion_matrix_dtc.astype('float') / confusion_matrix_dtc.sum(axis=1)[:, np.newaxis]
confusion_matrix_dtc_percent
Out[230]:
array([[0.85714286, 0.14285714, 0.        ],
       [0.05504587, 0.81651376, 0.12844037],
       [0.00909091, 0.13636364, 0.85454545]])
In [231]:
Confusion_Matrix_Plotter(confusion_matrix_dtc_percent, 0)
In [232]:
print(classification_report(y_test, y_pred_dtc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.81      0.86      0.83        35
Medium Price       0.82      0.82      0.82       109
  High Price       0.87      0.85      0.86       110

    accuracy                           0.84       254
   macro avg       0.83      0.84      0.84       254
weighted avg       0.84      0.84      0.84       254

In [233]:
dt_t1_l, dt_t2_l, dt_t1_m, dt_t2_m, dt_t1_h, dt_t2_h = Compute_Error(confusion_matrix_dtc)
Type1_Error_LowPrice: 7
Type2_Error_LowPrice: 5
Type1_Error_MediumPrice: 20
Type2_Error_MediumPrice: 20
Type1_Error_HighPrice: 14
Type2_Error_HighPrice: 16
In [234]:
dt_pl, dt_pm, dt_ph = precision_score(y_test, y_pred_dtc, average=None)

dt_rl, dt_rm, dt_rh = recall_score(y_test, y_pred_dtc, average=None)

dt_fl, dt_fm, dt_fh = f1_score(y_test, y_pred_dtc, average=None)
In [235]:
cv_dt = cross_val_score(dtc, x_train, y_train, cv = 10, scoring='accuracy')
cv_dt
Out[235]:
array([0.81372549, 0.82352941, 0.76237624, 0.85148515, 0.88118812,
       0.85148515, 0.85148515, 0.86138614, 0.87128713, 0.8019802 ])
In [236]:
cv_dt_m = cv_dt.mean()
print("Cross Validation Score:", cv_dt_m)
Cross Validation Score: 0.8369928169287517
In [237]:
feature_importance = pd.Series(dtc.feature_importances_, index = features).sort_values(ascending = False)

sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
In [238]:
plot_learning_curves(x_train, y_train, x_test, y_test, dtc)
plt.show()

Finding the best parameter max_leaf_nodes using GridSearchCV()

In [239]:
dtc.get_params()
Out[239]:
{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}
In [240]:
leaf_nodes_list = list(range(1, 16))
In [241]:
parameters = {'max_leaf_nodes': leaf_nodes_list, 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 5, 10], 
              'min_samples_leaf': [1, 2, 3, 4, 5, 6]}
dt_gscv = GridSearchCV(estimator = dtc, param_grid = parameters, scoring = 'accuracy', cv = 10)
dt_gscv = dt_gscv.fit(x_train, y_train)
print("Best Parameters:", dt_gscv.best_params_)
Best Parameters: {'criterion': 'gini', 'max_leaf_nodes': 11, 'min_samples_leaf': 6, 'min_samples_split': 2}
In [242]:
print("GridSearch CV Best Parameters:", dt_gscv.best_params_) 

print("\nGridSearch CV Best Estimator:", dt_gscv.best_estimator_)

print("\nGridSearch CV Best score:", dt_gscv.best_score_)

cv_dt_gscv_b = dt_gscv.best_score_
GridSearch CV Best Parameters: {'criterion': 'gini', 'max_leaf_nodes': 11, 'min_samples_leaf': 6, 'min_samples_split': 2}

GridSearch CV Best Estimator: DecisionTreeClassifier(max_leaf_nodes=11, min_samples_leaf=6)

GridSearch CV Best score: 0.8300524170064065
In [243]:
nleaf_list = []
score_list = []
for i in range(2, 16):
    nleaf_list.append(i)
    parameters = {'max_leaf_nodes': [i]}
    grid_search = GridSearchCV(estimator = dtc, param_grid = parameters, scoring = 'accuracy', cv = 10)
    grid_search = grid_search.fit(x_train, y_train)
    score_list.append(grid_search.best_score_)

# Plot of tree sizes VS classification rate.
plt.plot(nleaf_list, score_list)
plt.scatter(nleaf_list, score_list)
plt.title("Plot of Tree Size VS Classification Rate")
# plt.grid(True)
Out[243]:
Text(0.5, 1.0, 'Plot of Tree Size VS Classification Rate')

Plotting the Pruned Tree

In [244]:
dt_gscv.fit(x_train, y_train)
Out[244]:
GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                         'min_samples_split': [2, 5, 10]},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                            12, 13, 14, 15],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                         'min_samples_split': [2, 5, 10]},
             scoring='accuracy')
DecisionTreeClassifier()
DecisionTreeClassifier()
In [245]:
dtc_pt = dt_gscv
In [246]:
dtc_pt2 = DecisionTreeClassifier(max_leaf_nodes = 11, min_samples_leaf = 6)
dtc_pt2.fit(x_train, y_train)
Plotter(dtc_pt2)
In [247]:
# Prediction on Testing Data
y_pred_dtc_pt = dtc_pt.predict(x_test)
dtc_pt_accuracy = metrics.accuracy_score(y_test, y_pred_dtc_pt)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dtc_pt))
Accuracy: 0.8188976377952756
In [248]:
# Prediction on Training Data
y_pred2_dtc_pt = dtc_pt.predict(x_train)
dtc_pt_taccuracy = metrics.accuracy_score(y_train, y_pred2_dtc_pt)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_dtc_pt))
Accuracy: 0.8507905138339921
In [249]:
confusion_matrix_dtc_pt = metrics.confusion_matrix(y_test, y_pred_dtc_pt)
confusion_matrix_dtc_pt
Out[249]:
array([[ 29,   5,   1],
       [ 11,  75,  23],
       [  0,   6, 104]], dtype=int64)
In [250]:
# cm_df = pd.DataFrame(confusion_matrix_dtc_pt)
# sns.heatmap(cm_df, annot=True, fmt="d")
# plt.tight_layout()
In [251]:
Confusion_Matrix_Plotter(confusion_matrix_dtc_pt, 1)
In [252]:
confusion_matrix_dtc_pt_percent = confusion_matrix_dtc_pt.astype('float') / confusion_matrix_dtc_pt.sum(axis=1)[:, np.newaxis]
confusion_matrix_dtc_pt_percent
Out[252]:
array([[0.82857143, 0.14285714, 0.02857143],
       [0.10091743, 0.68807339, 0.21100917],
       [0.        , 0.05454545, 0.94545455]])
In [253]:
Confusion_Matrix_Plotter(confusion_matrix_dtc_pt_percent, 0)
In [254]:
print(classification_report(y_test, y_pred_dtc_pt, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.72      0.83      0.77        35
Medium Price       0.87      0.69      0.77       109
  High Price       0.81      0.95      0.87       110

    accuracy                           0.82       254
   macro avg       0.80      0.82      0.81       254
weighted avg       0.83      0.82      0.82       254

In [255]:
dtp_t1_l, dtp_t2_l, dtp_t1_m, dtp_t2_m, dtp_t1_h, dtp_t2_h = Compute_Error(confusion_matrix_dtc_pt)
Type1_Error_LowPrice: 11
Type2_Error_LowPrice: 6
Type1_Error_MediumPrice: 11
Type2_Error_MediumPrice: 34
Type1_Error_HighPrice: 24
Type2_Error_HighPrice: 6
In [256]:
dtp_pl, dtp_pm, dtp_ph = precision_score(y_test, y_pred_dtc_pt, average=None)

dtp_rl, dtp_rm, dtp_rh = recall_score(y_test, y_pred_dtc_pt, average=None)

dtp_fl, dtp_fm, dtp_fh = f1_score(y_test, y_pred_dtc_pt, average=None)
In [257]:
feature_importance = pd.Series(dtc_pt2.feature_importances_, index = features).sort_values(ascending = False)

sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
In [258]:
plot_learning_curves(x_train, y_train, x_test, y_test, dtc_pt2) #dtc_pt2 dt_gscv
plt.show()

Random Forest¶

Training a Random Forest with the best parameter max_leaf_nodes using RandomForestClassifier()

In [259]:
rfc = RandomForestClassifier(max_leaf_nodes = 11, n_estimators = 100)
rfc.fit(x_train, y_train)
Out[259]:
RandomForestClassifier(max_leaf_nodes=11)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_leaf_nodes=11)
In [260]:
# Model Scores on training and test set
print("Training Set score:", rfc.score(x_train, y_train))
print("Test Set score:", rfc.score(x_test, y_test))
Training Set score: 0.8596837944664032
Test Set score: 0.8307086614173228
In [261]:
# Prediction on Testing Data
y_pred_rfc = rfc.predict(x_test)
rfc_accuracy = metrics.accuracy_score(y_test, y_pred_rfc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rfc))
Accuracy: 0.8307086614173228
In [262]:
# Prediction on Training Data
y_pred2_rfc = rfc.predict(x_train)
rfc_taccuracy = metrics.accuracy_score(y_train, y_pred2_rfc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_rfc))
Accuracy: 0.8596837944664032
In [263]:
confusion_matrix_rfc = metrics.confusion_matrix(y_test, y_pred_rfc)
confusion_matrix_rfc
Out[263]:
array([[ 27,   7,   1],
       [  7,  82,  20],
       [  0,   8, 102]], dtype=int64)
In [264]:
Confusion_Matrix_Plotter(confusion_matrix_rfc, 1)
In [265]:
confusion_matrix_rfc_percent = confusion_matrix_rfc.astype('float') / confusion_matrix_rfc.sum(axis=1)[:, np.newaxis]
confusion_matrix_rfc_percent
Out[265]:
array([[0.77142857, 0.2       , 0.02857143],
       [0.06422018, 0.75229358, 0.18348624],
       [0.        , 0.07272727, 0.92727273]])
In [266]:
Confusion_Matrix_Plotter(confusion_matrix_rfc_percent, 0)
In [267]:
print(classification_report(y_test, y_pred_rfc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.79      0.77      0.78        35
Medium Price       0.85      0.75      0.80       109
  High Price       0.83      0.93      0.88       110

    accuracy                           0.83       254
   macro avg       0.82      0.82      0.82       254
weighted avg       0.83      0.83      0.83       254

In [268]:
rfc_t1_l, rfc_t2_l, rfc_t1_m, rfc_t2_m, rfc_t1_h, rfc_t2_h = Compute_Error(confusion_matrix_rfc)
Type1_Error_LowPrice: 7
Type2_Error_LowPrice: 8
Type1_Error_MediumPrice: 15
Type2_Error_MediumPrice: 27
Type1_Error_HighPrice: 21
Type2_Error_HighPrice: 8
In [269]:
rfc_pl, rfc_pm, rfc_ph = precision_score(y_test, y_pred_rfc, average=None)

rfc_rl, rfc_rm, rfc_rh = recall_score(y_test, y_pred_rfc, average=None)

rfc_fl, rfc_fm, rfc_fh = f1_score(y_test, y_pred_rfc, average=None)
In [270]:
cv_rfc = cross_val_score(rfc, x_train, y_train, cv = 10, scoring='accuracy')
cv_rfc
Out[270]:
array([0.85294118, 0.78431373, 0.8019802 , 0.9009901 , 0.82178218,
       0.84158416, 0.81188119, 0.82178218, 0.85148515, 0.81188119])
In [271]:
cv_rfc_m = cv_rfc.mean()
print("Cross Validation Score:", cv_rfc_m)
Cross Validation Score: 0.8300621238594449
In [272]:
feature_importance = pd.Series(rfc.feature_importances_, index = features).sort_values(ascending = False)

sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
In [273]:
plot_learning_curves(x_train, y_train, x_test, y_test, rfc)
plt.show()

Random Forest with Randomized Search CV¶

In [274]:
rfc.get_params()
Out[274]:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': 11,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
In [275]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators
Out[275]:
[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
In [276]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_depth
Out[276]:
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]
In [277]:
leaf_nodes_list = list(range(1, 16))
In [278]:
random_grid = {'n_estimators': n_estimators, 'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth, 'max_leaf_nodes': leaf_nodes_list,
               'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6], 'bootstrap': [True, False]}
In [279]:
rfc_rscv = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, cv = 10, 
                              verbose=1, n_jobs = -1)
In [280]:
rfc_rscv.fit(x_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Out[280]:
RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7,
                                                           8, 9, 10, 11, 12, 13,
                                                           14, 15],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7,
                                                           8, 9, 10, 11, 12, 13,
                                                           14, 15],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=1)
RandomForestClassifier()
RandomForestClassifier()
In [281]:
print("GridSearch CV Best Parameters:", rfc_rscv.best_params_) 

print("\nGridSearch CV Best Estimator:", rfc_rscv.best_estimator_)

print("\nGridSearch CV Best score:", rfc_rscv.best_score_)

cv_rfc_rscv_b = rfc_rscv.best_score_
GridSearch CV Best Parameters: {'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_leaf_nodes': 14, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}

GridSearch CV Best Estimator: RandomForestClassifier(bootstrap=False, max_features='auto', max_leaf_nodes=14,
                       min_samples_leaf=4, min_samples_split=5,
                       n_estimators=1600)

GridSearch CV Best score: 0.8350223257619879
In [282]:
# Model Scores on training and test set
print("Training Set score:", rfc_rscv.score(x_train, y_train))
print("Test Set score:", rfc_rscv.score(x_test, y_test))
Training Set score: 0.8616600790513834
Test Set score: 0.8267716535433071
In [283]:
# Prediction on Testing Data
y_pred_rfc_rscv = rfc_rscv.predict(x_test)
rfc_rscv_accuracy = metrics.accuracy_score(y_test, y_pred_rfc_rscv)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rfc_rscv))
Accuracy: 0.8267716535433071
In [284]:
# Prediction on Training Data
y_pred2_rfc_rscv = rfc_rscv.predict(x_train)
rfc_rscv_taccuracy = metrics.accuracy_score(y_train, y_pred2_rfc_rscv)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_rfc_rscv))
Accuracy: 0.8616600790513834
In [285]:
confusion_matrix_rfc_rscv = metrics.confusion_matrix(y_test, y_pred_rfc_rscv)
confusion_matrix_rfc_rscv
Out[285]:
array([[ 27,   7,   1],
       [  8,  80,  21],
       [  0,   7, 103]], dtype=int64)
In [286]:
Confusion_Matrix_Plotter(confusion_matrix_rfc_rscv, 1)
In [287]:
confusion_matrix_rfc_rscv_percent = confusion_matrix_rfc_rscv.astype('float') / confusion_matrix_rfc_rscv.sum(axis=1)[:, np.newaxis]
confusion_matrix_rfc_rscv_percent
Out[287]:
array([[0.77142857, 0.2       , 0.02857143],
       [0.0733945 , 0.73394495, 0.19266055],
       [0.        , 0.06363636, 0.93636364]])
In [288]:
Confusion_Matrix_Plotter(confusion_matrix_rfc_rscv_percent, 0)
In [289]:
print(classification_report(y_test, y_pred_rfc_rscv, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.77      0.77      0.77        35
Medium Price       0.85      0.73      0.79       109
  High Price       0.82      0.94      0.88       110

    accuracy                           0.83       254
   macro avg       0.82      0.81      0.81       254
weighted avg       0.83      0.83      0.82       254

In [290]:
rfc_rscv_t1_l, rfc_rscv_t2_l, rfc_rscv_t1_m, rfc_rscv_t2_m, rfc_rscv_t1_h, rfc_rscv_t2_h = Compute_Error(confusion_matrix_rfc_rscv)
Type1_Error_LowPrice: 8
Type2_Error_LowPrice: 8
Type1_Error_MediumPrice: 14
Type2_Error_MediumPrice: 29
Type1_Error_HighPrice: 22
Type2_Error_HighPrice: 7
In [291]:
rfc_rscv_pl, rfc_rscv_pm, rfc_rscv_ph = precision_score(y_test, y_pred_rfc_rscv, average=None)

rfc_rscv_rl, rfc_rscv_rm, rfc_rscv_rh = recall_score(y_test, y_pred_rfc_rscv, average=None)

rfc_rscv_fl, rfc_rscv_fm, rfc_rscv_fh = f1_score(y_test, y_pred_rfc_rscv, average=None)
In [292]:
rfc2 = RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5, min_samples_split=10, n_estimators=400)
rfc2.fit(x_train, y_train)
Out[292]:
RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5,
                       min_samples_split=10, n_estimators=400)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_leaf_nodes=14, min_samples_leaf=5,
                       min_samples_split=10, n_estimators=400)
In [293]:
feature_importance = pd.Series(rfc2.feature_importances_, index = features).sort_values(ascending = False)

sns.barplot(x = feature_importance, y = feature_importance.index)
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.tight_layout()
In [294]:
# plot_learning_curves(x_train, y_train, x_test, y_test, rfc_rscv)
# plt.show()

Artificial Neural Networks¶

hidden_layer_sizes : This parameter allows us to set the number of layers and the number of nodes we wish to have in the Neural Network Classifier. Each element in the tuple represents the number of nodes at the ith position where i is the index of the tuple. Thus the length of tuple denotes the total number of hidden layers in the network.

In [295]:
mlp = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(48, 24, 12), alpha=1e-06, max_iter=5000)
In [296]:
mlp.fit(x_train, y_train)
Out[296]:
MLPClassifier(activation='logistic', alpha=1e-06,
              hidden_layer_sizes=(48, 24, 12), max_iter=5000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MLPClassifier(activation='logistic', alpha=1e-06,
              hidden_layer_sizes=(48, 24, 12), max_iter=5000)
In [297]:
# Prediction on Testing Data
y_pred_mlp = mlp.predict(x_test)
# Accuracy Score = (TP + TN)/ (TP + FN + TN + FP) 
mlp_accuracy = metrics.accuracy_score(y_test, y_pred_mlp)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_mlp))
Accuracy: 0.42913385826771655
In [298]:
# Prediction on Training Data
y_pred2_mlp = mlp.predict(x_train)
mlp_taccuracy = metrics.accuracy_score(y_train, y_pred2_mlp)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_mlp))
Accuracy: 0.4426877470355731
In [299]:
# Mean accuracy on the given test data and label
mlp.score(x_test, y_pred_mlp)
Out[299]:
1.0
In [300]:
# Model Scores on training and test set
print("Training Set score:", mlp.score(x_train, y_train))
print("Test Set score:", mlp.score(x_test, y_test))
Training Set score: 0.4426877470355731
Test Set score: 0.42913385826771655
In [301]:
y_pred_mlp
Out[301]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
In [302]:
print("Number of layers:", mlp.n_layers_)
print("Number of iterations the solver has run:", mlp.n_iter_)
print("Computed Loss:", mlp.loss_)
print("Minimum loss reached by the solver throughout fitting:", mlp.best_loss_)
print("Number of features seen during fit:", mlp.n_features_in_)
print("Output activation function:", mlp.out_activation_) #logistic sigmoid function: returns f(x) = 1 / (1 + exp(-x)).
Number of layers: 5
Number of iterations the solver has run: 28
Computed Loss: 1.0343383110256572
Minimum loss reached by the solver throughout fitting: 1.0335607958416024
Number of features seen during fit: 5
Output activation function: softmax
In [303]:
confusion_matrix_mlp = metrics.confusion_matrix(y_test, y_pred_mlp)
confusion_matrix_mlp
Out[303]:
array([[  0,  35,   0],
       [  0, 109,   0],
       [  0, 110,   0]], dtype=int64)
In [304]:
Confusion_Matrix_Plotter(confusion_matrix_mlp, 1)
In [305]:
confusion_matrix_mlp_percent = confusion_matrix_mlp.astype('float') / confusion_matrix_mlp.sum(axis=1)[:, np.newaxis]
confusion_matrix_mlp_percent
Out[305]:
array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])
In [306]:
Confusion_Matrix_Plotter(confusion_matrix_mlp_percent, 0)
In [307]:
print(classification_report(y_test, y_pred_mlp, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.00      0.00      0.00        35
Medium Price       0.43      1.00      0.60       109
  High Price       0.00      0.00      0.00       110

    accuracy                           0.43       254
   macro avg       0.14      0.33      0.20       254
weighted avg       0.18      0.43      0.26       254

In [308]:
mlp1_t1_l, mlp1_t2_l, mlp1_t1_m, mlp1_t2_m, mlp1_t1_h, mlp1_t2_h = Compute_Error(confusion_matrix_mlp)
Type1_Error_LowPrice: 0
Type2_Error_LowPrice: 35
Type1_Error_MediumPrice: 145
Type2_Error_MediumPrice: 0
Type1_Error_HighPrice: 0
Type2_Error_HighPrice: 110
In [309]:
mlp1_pl, mlp1_pm, mlp1_ph = precision_score(y_test, y_pred_mlp, average=None)

mlp1_rl, mlp1_rm, mlp1_rh = recall_score(y_test, y_pred_mlp, average=None)

mlp1_fl, mlp1_fm, mlp1_fh = f1_score(y_test, y_pred_mlp, average=None)
In [310]:
plt.plot(mlp.loss_curve_)
plt.title("Loss Curve")
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()

Neural Network with a different network structure

In [311]:
mlp2 = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(500, 250), alpha=1e-08, max_iter=5000)
mlp2.fit(x_train, y_train)
Out[311]:
MLPClassifier(activation='logistic', alpha=1e-08, hidden_layer_sizes=(500, 250),
              max_iter=5000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MLPClassifier(activation='logistic', alpha=1e-08, hidden_layer_sizes=(500, 250),
              max_iter=5000)
In [312]:
# Prediction on Testing Data
y_pred_mlp2 = mlp2.predict(x_test)
mlp2_accuracy = metrics.accuracy_score(y_test, y_pred_mlp2)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_mlp2))
Accuracy: 0.7047244094488189
In [313]:
# Prediction on Training Data
y_pred2_mlp2 = mlp2.predict(x_train)
mlp2_taccuracy = metrics.accuracy_score(y_train, y_pred2_mlp2)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_mlp2))
Accuracy: 0.733201581027668
In [314]:
# Mean accuracy on the given test data and label
mlp2.score(x_test, y_pred_mlp2)
Out[314]:
1.0
In [315]:
# Model Scores on training and test set
print("Training Set score:", mlp2.score(x_train, y_train))
print("Test Set score:", mlp2.score(x_test, y_test))
Training Set score: 0.733201581027668
Test Set score: 0.7047244094488189
In [316]:
y_pred_mlp2
Out[316]:
array([1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1,
       0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 2, 1, 2, 2, 1, 1, 1, 1,
       2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 0, 2, 1, 2, 1, 0, 1, 1, 2,
       2, 1, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2,
       1, 1, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 2, 1, 2, 1, 0, 1,
       1, 2, 0, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2,
       2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 0, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 2, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1,
       2, 0, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1,
       1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1,
       2, 0, 1, 2, 2, 1, 1, 1, 2, 2, 0, 2], dtype=int64)
In [317]:
print("Number of layers:", mlp2.n_layers_)
print("Number of iterations the solver has run:", mlp2.n_iter_)
print("Computed Loss:", mlp2.loss_)
print("Minimum loss reached by the solver throughout fitting:", mlp2.best_loss_)
print("Number of features seen during fit:", mlp2.n_features_in_)
print("Output activation function:", mlp2.out_activation_)
Number of layers: 4
Number of iterations the solver has run: 93
Computed Loss: 0.6468561423488488
Minimum loss reached by the solver throughout fitting: 0.5997767700479348
Number of features seen during fit: 5
Output activation function: softmax
In [318]:
confusion_matrix_mlp2 = metrics.confusion_matrix(y_test, y_pred_mlp2)
confusion_matrix_mlp2
Out[318]:
array([[15, 19,  1],
       [ 4, 90, 15],
       [ 0, 36, 74]], dtype=int64)
In [319]:
Confusion_Matrix_Plotter(confusion_matrix_mlp2, 1)
In [320]:
confusion_matrix_mlp2_percent = confusion_matrix_mlp2.astype('float') / confusion_matrix_mlp2.sum(axis=1)[:, np.newaxis]
confusion_matrix_mlp2_percent
Out[320]:
array([[0.42857143, 0.54285714, 0.02857143],
       [0.03669725, 0.82568807, 0.13761468],
       [0.        , 0.32727273, 0.67272727]])
In [321]:
Confusion_Matrix_Plotter(confusion_matrix_mlp2_percent, 0)
In [322]:
print(classification_report(y_test, y_pred_mlp2, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.79      0.43      0.56        35
Medium Price       0.62      0.83      0.71       109
  High Price       0.82      0.67      0.74       110

    accuracy                           0.70       254
   macro avg       0.74      0.64      0.67       254
weighted avg       0.73      0.70      0.70       254

In [323]:
mlp2_t1_l, mlp2_t2_l, mlp2_t1_m, mlp2_t2_m, mlp2_t1_h, mlp2_t2_h = Compute_Error(confusion_matrix_mlp2)
Type1_Error_LowPrice: 4
Type2_Error_LowPrice: 20
Type1_Error_MediumPrice: 55
Type2_Error_MediumPrice: 19
Type1_Error_HighPrice: 16
Type2_Error_HighPrice: 36
In [324]:
mlp2_pl, mlp2_pm, mlp2_ph = precision_score(y_test, y_pred_mlp2, average=None)

mlp2_rl, mlp2_rm, mlp2_rh = recall_score(y_test, y_pred_mlp2, average=None)

mlp2_fl, mlp2_fm, mlp2_fh = f1_score(y_test, y_pred_mlp2, average=None)
In [325]:
plt.plot(mlp2.loss_curve_)
plt.title("Loss Curve")
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()
In [326]:
improvement_mlp = mlp2_accuracy - mlp_accuracy
improvement_mlp * 100
Out[326]:
27.55905511811023

XGBoost¶

In [327]:
from xgboost import XGBClassifier
In [328]:
xgbc = XGBClassifier()
In [329]:
print(xgbc)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
In [330]:
xgbc.fit(x_train, y_train)
Out[330]:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', ...)
In [331]:
# Model Scores on training and test set
print("Training Set score:", xgbc.score(x_train, y_train))
print("Test Set score:", xgbc.score(x_test, y_test))
Training Set score: 1.0
Test Set score: 0.8622047244094488
In [332]:
# Prediction on Testing Data
y_pred_xgbc = xgbc.predict(x_test)
xgbc_accuracy = metrics.accuracy_score(y_test, y_pred_xgbc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_xgbc))
Accuracy: 0.8622047244094488
In [333]:
# Prediction on Training Data
y_pred2_xgbc = xgbc.predict(x_train)
xgbc_taccuracy = metrics.accuracy_score(y_train, y_pred2_xgbc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_xgbc))
Accuracy: 1.0
In [334]:
confusion_matrix_xgbc = metrics.confusion_matrix(y_test, y_pred_xgbc)
confusion_matrix_xgbc
Out[334]:
array([[ 29,   6,   0],
       [ 10,  87,  12],
       [  0,   7, 103]], dtype=int64)
In [335]:
Confusion_Matrix_Plotter(confusion_matrix_xgbc, 1)
In [336]:
confusion_matrix_xgbc_percent = confusion_matrix_xgbc.astype('float') / confusion_matrix_xgbc.sum(axis=1)[:, np.newaxis]
confusion_matrix_xgbc_percent
Out[336]:
array([[0.82857143, 0.17142857, 0.        ],
       [0.09174312, 0.79816514, 0.11009174],
       [0.        , 0.06363636, 0.93636364]])
In [337]:
Confusion_Matrix_Plotter(confusion_matrix_xgbc_percent, 0)
In [338]:
print(classification_report(y_test, y_pred_xgbc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.74      0.83      0.78        35
Medium Price       0.87      0.80      0.83       109
  High Price       0.90      0.94      0.92       110

    accuracy                           0.86       254
   macro avg       0.84      0.85      0.84       254
weighted avg       0.86      0.86      0.86       254

In [339]:
xgbc_t1_l, xgbc_t2_l, xgbc_t1_m, xgbc_t2_m, xgbc_t1_h, xgbc_t2_h = Compute_Error(confusion_matrix_xgbc)
Type1_Error_LowPrice: 10
Type2_Error_LowPrice: 6
Type1_Error_MediumPrice: 13
Type2_Error_MediumPrice: 22
Type1_Error_HighPrice: 12
Type2_Error_HighPrice: 7
In [340]:
xgbc_pl, xgbc_pm, xgbc_ph = precision_score(y_test, y_pred_xgbc, average=None)

xgbc_rl, xgbc_rm, xgbc_rh = recall_score(y_test, y_pred_xgbc, average=None)

xgbc_fl, xgbc_fm, xgbc_fh = f1_score(y_test, y_pred_xgbc, average=None)
In [341]:
cv_xgbc = cross_val_score(xgbc, x_train, y_train, cv = 10, scoring='accuracy')
cv_xgbc
Out[341]:
array([0.8627451 , 0.83333333, 0.84158416, 0.91089109, 0.88118812,
       0.89108911, 0.87128713, 0.88118812, 0.89108911, 0.86138614])
In [342]:
cv_xgbc_m = cv_xgbc.mean()
print("Cross Validation Score:", cv_xgbc_m)
Cross Validation Score: 0.8725781401669579
In [343]:
plot_learning_curves(x_train, y_train, x_test, y_test, xgbc)
plt.show()

Stochastic Gradient Descent¶

In [344]:
from sklearn.linear_model import SGDClassifier
In [345]:
sgd = SGDClassifier()
In [346]:
sgd.get_params()
Out[346]:
{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}
In [347]:
sgd.fit(x_train, y_train)
Out[347]:
SGDClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier()
In [348]:
# Model Scores on training and test set
print("Training Set score:", sgd.score(x_train, y_train))
print("Test Set score:", sgd.score(x_test, y_test))
Training Set score: 0.7262845849802372
Test Set score: 0.7165354330708661
In [349]:
# Prediction on Testing Data
y_pred_sgd = sgd.predict(x_test)
sgd_accuracy = metrics.accuracy_score(y_test, y_pred_sgd)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_sgd))
Accuracy: 0.7165354330708661
In [350]:
# Prediction on Training Data
y_pred2_sgd = sgd.predict(x_train)
sgd_taccuracy = metrics.accuracy_score(y_train, y_pred2_sgd)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_sgd))
Accuracy: 0.7262845849802372
In [351]:
confusion_matrix_sgd = metrics.confusion_matrix(y_test, y_pred_sgd)
confusion_matrix_sgd
Out[351]:
array([[10, 24,  1],
       [ 5, 83, 21],
       [ 0, 21, 89]], dtype=int64)
In [352]:
Confusion_Matrix_Plotter(confusion_matrix_sgd, 1)
In [353]:
confusion_matrix_sgdc_percent = confusion_matrix_sgd.astype('float') / confusion_matrix_sgd.sum(axis=1)[:, np.newaxis]
confusion_matrix_sgdc_percent
Out[353]:
array([[0.28571429, 0.68571429, 0.02857143],
       [0.04587156, 0.76146789, 0.19266055],
       [0.        , 0.19090909, 0.80909091]])
In [354]:
Confusion_Matrix_Plotter(confusion_matrix_sgdc_percent, 0)
In [355]:
print(classification_report(y_test, y_pred_sgd, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.67      0.29      0.40        35
Medium Price       0.65      0.76      0.70       109
  High Price       0.80      0.81      0.81       110

    accuracy                           0.72       254
   macro avg       0.71      0.62      0.64       254
weighted avg       0.72      0.72      0.70       254

In [356]:
sgd_t1_l, sgd_t2_l, sgd_t1_m, sgd_t2_m, sgd_t1_h, sgd_t2_h = Compute_Error(confusion_matrix_sgd)
Type1_Error_LowPrice: 5
Type2_Error_LowPrice: 25
Type1_Error_MediumPrice: 45
Type2_Error_MediumPrice: 26
Type1_Error_HighPrice: 22
Type2_Error_HighPrice: 21
In [357]:
sgd_pl, sgd_pm, sgd_ph = precision_score(y_test, y_pred_sgd, average=None)

sgd_rl, sgd_rm, sgd_rh = recall_score(y_test, y_pred_sgd, average=None)

sgd_fl, sgd_fm, sgd_fh = f1_score(y_test, y_pred_sgd, average=None)
In [358]:
cv_sgd = cross_val_score(sgd, x_train, y_train, cv = 10, scoring='accuracy')
cv_sgd
Out[358]:
array([0.7745098 , 0.74509804, 0.67326733, 0.76237624, 0.73267327,
       0.7029703 , 0.81188119, 0.74257426, 0.73267327, 0.73267327])
In [359]:
cv_sgd_m = cv_sgd.mean()
print("Cross Validation Score:", cv_sgd_m)
Cross Validation Score: 0.7410696952048145
In [360]:
plot_learning_curves(x_train, y_train, x_test, y_test, sgd)
plt.show()

Gradient Boosting Classifier¶

In [361]:
from sklearn.ensemble import GradientBoostingClassifier
In [362]:
gbc = GradientBoostingClassifier()
In [363]:
gbc.get_params()
Out[363]:
{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}
In [364]:
gbc.fit(x_train, y_train)
Out[364]:
GradientBoostingClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier()
In [365]:
# Model Scores on training and test set
print("Training Set score:", gbc.score(x_train, y_train))
print("Test Set score:", gbc.score(x_test, y_test))
Training Set score: 0.974308300395257
Test Set score: 0.8661417322834646
In [366]:
# Prediction on Testing Data
y_pred_gbc = gbc.predict(x_test)
gbc_accuracy = metrics.accuracy_score(y_test, y_pred_gbc)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_gbc))
Accuracy: 0.8661417322834646
In [367]:
# Prediction on Training Data
y_pred2_gbc = gbc.predict(x_train)
gbc_taccuracy = metrics.accuracy_score(y_train, y_pred2_gbc)
print("Accuracy:", metrics.accuracy_score(y_train, y_pred2_gbc))
Accuracy: 0.974308300395257
In [368]:
confusion_matrix_gbc = metrics.confusion_matrix(y_test, y_pred_gbc)
confusion_matrix_gbc
Out[368]:
array([[ 28,   7,   0],
       [  7,  91,  11],
       [  0,   9, 101]], dtype=int64)
In [369]:
Confusion_Matrix_Plotter(confusion_matrix_gbc, 1)
In [370]:
confusion_matrix_gbc_percent = confusion_matrix_gbc.astype('float') / confusion_matrix_gbc.sum(axis=1)[:, np.newaxis]
confusion_matrix_gbc_percent
Out[370]:
array([[0.8       , 0.2       , 0.        ],
       [0.06422018, 0.83486239, 0.10091743],
       [0.        , 0.08181818, 0.91818182]])
In [371]:
Confusion_Matrix_Plotter(confusion_matrix_gbc_percent, 0)
In [372]:
print(classification_report(y_test, y_pred_gbc, target_names=["Low Price", "Medium Price", "High Price"]))
              precision    recall  f1-score   support

   Low Price       0.80      0.80      0.80        35
Medium Price       0.85      0.83      0.84       109
  High Price       0.90      0.92      0.91       110

    accuracy                           0.87       254
   macro avg       0.85      0.85      0.85       254
weighted avg       0.87      0.87      0.87       254

In [373]:
gbc_t1_l, gbc_t2_l, gbc_t1_m, gbc_t2_m, gbc_t1_h, gbc_t2_h = Compute_Error(confusion_matrix_gbc)
Type1_Error_LowPrice: 7
Type2_Error_LowPrice: 7
Type1_Error_MediumPrice: 16
Type2_Error_MediumPrice: 18
Type1_Error_HighPrice: 11
Type2_Error_HighPrice: 9
In [374]:
gbc_pl, gbc_pm, gbc_ph = precision_score(y_test, y_pred_gbc, average=None)

gbc_rl, gbc_rm, gbc_rh = recall_score(y_test, y_pred_gbc, average=None)

gbc_fl, gbc_fm, gbc_fh = f1_score(y_test, y_pred_gbc, average=None)
In [375]:
cv_gbc = cross_val_score(gbc, x_train, y_train, cv = 10, scoring='accuracy')
cv_gbc
Out[375]:
array([0.8627451 , 0.83333333, 0.84158416, 0.88118812, 0.86138614,
       0.9009901 , 0.87128713, 0.88118812, 0.87128713, 0.83168317])
In [376]:
cv_gbc_m = cv_gbc.mean()
print("Cross Validation Score:", cv_sgd_m)
Cross Validation Score: 0.7410696952048145
In [377]:
plot_learning_curves(x_train, y_train, x_test, y_test, gbc)
plt.show()

Comparing Performance of Various Models¶

In [378]:
models_error = [('Naive Bayes', nb_t1_l, nb_t2_l, nb_t1_m, nb_t2_m, nb_t1_h, nb_t2_h),
('Support Vector Classification', svc_t1_l, svc_t2_l, svc_t1_m, svc_t2_m, svc_t1_h, svc_t2_h),
('Support Vector Classification with GridSearchCV', svc_gs_t1_l, svc_gs_t2_l, svc_gs_t1_m, svc_gs_t2_m, svc_gs_t1_h, svc_gs_t2_h),
('Logistic Regression', lr_t1_l, lr_t2_l, lr_t1_m, lr_t2_m, lr_t1_h, lr_t2_h),
('Logistic Regression with GridSearchCV', lr_gs_t1_l, lr_gs_t2_l, lr_gs_t1_m, lr_gs_t2_m, lr_gs_t1_h, lr_gs_t2_h),
('AdaBoost Classifier', abc_t1_l, abc_t2_l, abc_t1_m, abc_t2_m, abc_t1_h, abc_t2_h),
('K-Nearest Neighbors with GridSearchCV', knn_t1_l, knn_t2_l, knn_t1_m, knn_t2_m, knn_t1_h, knn_t2_h),
('Decision Trees', dt_t1_l, dt_t2_l, dt_t1_m, dt_t2_m, dt_t1_h, dt_t2_h),
('Decision Trees with GridSearchCV', dtp_t1_l, dtp_t2_l, dtp_t1_m, dtp_t2_m, dtp_t1_h, dtp_t2_h),
('Random Forest Classifier', rfc_t1_l, rfc_t2_l, rfc_t1_m, rfc_t2_m, rfc_t1_h, rfc_t2_h),
('Random Forest with RandomizedSearchCV', rfc_rscv_t1_l, rfc_rscv_t2_l, rfc_rscv_t1_m, rfc_rscv_t2_m, rfc_rscv_t1_h, rfc_rscv_t2_h),
('Neural Network (48, 24, 12)', mlp1_t1_l, mlp1_t2_l, mlp1_t1_m, mlp1_t2_m, mlp1_t1_h, mlp1_t2_h),
('Neural Network (500, 250)', mlp2_t1_l, mlp2_t2_l, mlp2_t1_m, mlp2_t2_m, mlp2_t1_h, mlp2_t2_h),
('XGBoost Classifier', xgbc_t1_l, xgbc_t2_l, xgbc_t1_m, xgbc_t2_m, xgbc_t1_h, xgbc_t2_h),
('Stochastic Gradient Descent', sgd_t1_l, sgd_t2_l, sgd_t1_m, sgd_t2_m, sgd_t1_h, sgd_t2_h),
('Gradient Boosting Classifier', gbc_t1_l, gbc_t2_l, gbc_t1_m, gbc_t2_m, gbc_t1_h, gbc_t2_h)]
In [379]:
error_data = pd.DataFrame(data = models_error, columns=['Model', 'Type1 Error Low Price', 'Type2 Error Low Price', 
                                                        'Type1 Error Medium Price', 'Type2 Error Medium Price', 
                                                        'Type1 Error High Price', 'Type2 Error High Price'])
error_data
Out[379]:
Model Type1 Error Low Price Type2 Error Low Price Type1 Error Medium Price Type2 Error Medium Price Type1 Error High Price Type2 Error High Price
0 Naive Bayes 12 17 31 45 35 16
1 Support Vector Classification 8 11 25 25 18 15
2 Support Vector Classification with GridSearchCV 8 10 22 25 18 13
3 Logistic Regression 3 19 46 22 20 28
4 Logistic Regression with GridSearchCV 6 13 38 23 18 26
5 AdaBoost Classifier 9 14 37 21 12 23
6 K-Nearest Neighbors with GridSearchCV 10 5 11 20 10 6
7 Decision Trees 7 5 20 20 14 16
8 Decision Trees with GridSearchCV 11 6 11 34 24 6
9 Random Forest Classifier 7 8 15 27 21 8
10 Random Forest with RandomizedSearchCV 8 8 14 29 22 7
11 Neural Network (48, 24, 12) 0 35 145 0 0 110
12 Neural Network (500, 250) 4 20 55 19 16 36
13 XGBoost Classifier 10 6 13 22 12 7
14 Stochastic Gradient Descent 5 25 45 26 22 21
15 Gradient Boosting Classifier 7 7 16 18 11 9
In [380]:
models_score = [('Naive Bayes', nb_pl, nb_pm, nb_ph, nb_rl, nb_rm, nb_rh, nb_fl, nb_fm, nb_fh),
                ('Support Vector Classification', svc_pl, svc_pm, svc_ph, svc_rl, svc_rm, svc_rh, svc_fl, svc_fm, svc_fh),
                ('Support Vector Classification with GridSearchCV', svc_gs_pl, svc_gs_pm, svc_gs_ph, svc_gs_rl, svc_gs_rm, svc_gs_rh, svc_gs_fl, svc_gs_fm, svc_gs_fh),
                ('Logistic Regression', lr_pl, lr_pm, lr_ph, lr_rl, lr_rm, lr_rh, lr_fl, lr_fm, lr_fh),
                ('Logistic Regression with GridSearchCV', lr_gs_pl, lr_gs_pm, lr_gs_ph, lr_gs_rl, lr_gs_rm, lr_gs_rh, lr_gs_fl, lr_gs_fm, lr_gs_fh),
                ('AdaBoost Classifier', abc_pl, abc_pm, abc_ph, abc_rl, abc_rm, abc_rh, abc_fl, abc_fm, abc_fh),
                ('K-Nearest Neighbors with GridSearchCV', knn_pl, knn_pm, knn_ph, knn_rl, knn_rm, knn_rh, knn_fl, knn_fm, knn_fh),
                ('Decision Trees', dt_pl, dt_pm, dt_ph, dt_rl, dt_rm, dt_rh, dt_fl, dt_fm, dt_fh),
                ('Decision Trees with GridSearchCV', dtp_pl, dtp_pm, dtp_ph, dtp_rl, dtp_rm, dtp_rh, dtp_fl, dtp_fm, dtp_fh),
                ('Random Forest Classifier', rfc_pl, rfc_pm, rfc_ph, rfc_rl, rfc_rm, rfc_rh, rfc_fl, rfc_fm, rfc_fh),
                ('Random Forest with RandomizedSearchCV', rfc_rscv_pl, rfc_rscv_pm, rfc_rscv_ph, rfc_rscv_rl, rfc_rscv_rm, rfc_rscv_rh, rfc_rscv_fl, rfc_rscv_fm, rfc_rscv_fh),
                ('Neural Network (48, 24, 12)', mlp1_pl, mlp1_pm, mlp1_ph, mlp1_rl, mlp1_rm, mlp1_rh, mlp1_fl, mlp1_fm, mlp1_fh),
                ('Neural Network (500, 250)', mlp2_pl, mlp2_pm, mlp2_ph, mlp2_rl, mlp2_rm, mlp2_rh, mlp2_fl, mlp2_fm, mlp2_fh),
                ('XGBoost Classifier', xgbc_pl, xgbc_pm, xgbc_ph, xgbc_rl, xgbc_rm, xgbc_rh, xgbc_fl, xgbc_fm, xgbc_fh),
                ('Stochastic Gradient Descent', sgd_pl, sgd_pm, sgd_ph, sgd_rl, sgd_rm, sgd_rh, sgd_fl, sgd_fm, sgd_fh),
                ('Gradient Boosting Classifier', gbc_pl, gbc_pm, gbc_ph, gbc_rl, gbc_rm, gbc_rh, gbc_fl, gbc_fm, gbc_fh)]
In [381]:
score_performance = pd.DataFrame(data=models_score,
                                 columns=['Model', 'Precision Score Low Price', 'Precision Score Medium Price', 'Precision Score High Price',
                                          'Recall Score Low Price', 'Recall Score Medium Price', 'Recall Score High Price',
                                          'F1 Score Low Price', 'F1 Score Medium Price', 'F1 Score High Price'])
score_performance
Out[381]:
Model Precision Score Low Price Precision Score Medium Price Precision Score High Price Recall Score Low Price Recall Score Medium Price Recall Score High Price F1 Score Low Price F1 Score Medium Price F1 Score High Price
0 Naive Bayes 0.600000 0.673684 0.728682 0.514286 0.587156 0.854545 0.553846 0.627451 0.786611
1 Support Vector Classification 0.750000 0.770642 0.840708 0.685714 0.770642 0.863636 0.716418 0.770642 0.852018
2 Support Vector Classification with GridSearchCV 0.757576 0.792453 0.843478 0.714286 0.770642 0.881818 0.735294 0.781395 0.862222
3 Logistic Regression 0.842105 0.654135 0.803922 0.457143 0.798165 0.745455 0.592593 0.719008 0.773585
4 Logistic Regression with GridSearchCV 0.785714 0.693548 0.823529 0.628571 0.788991 0.763636 0.698413 0.738197 0.792453
5 AdaBoost Classifier 0.700000 0.704000 0.878788 0.600000 0.807339 0.790909 0.646154 0.752137 0.832536
6 K-Nearest Neighbors with GridSearchCV 0.750000 0.890000 0.912281 0.857143 0.816514 0.945455 0.800000 0.851675 0.928571
7 Decision Trees 0.810811 0.816514 0.870370 0.857143 0.816514 0.854545 0.833333 0.816514 0.862385
8 Decision Trees with GridSearchCV 0.725000 0.872093 0.812500 0.828571 0.688073 0.945455 0.773333 0.769231 0.873950
9 Random Forest Classifier 0.794118 0.845361 0.829268 0.771429 0.752294 0.927273 0.782609 0.796117 0.875536
10 Random Forest with RandomizedSearchCV 0.771429 0.851064 0.824000 0.771429 0.733945 0.936364 0.771429 0.788177 0.876596
11 Neural Network (48, 24, 12) 0.000000 0.429134 0.000000 0.000000 1.000000 0.000000 0.000000 0.600551 0.000000
12 Neural Network (500, 250) 0.789474 0.620690 0.822222 0.428571 0.825688 0.672727 0.555556 0.708661 0.740000
13 XGBoost Classifier 0.743590 0.870000 0.895652 0.828571 0.798165 0.936364 0.783784 0.832536 0.915556
14 Stochastic Gradient Descent 0.666667 0.648438 0.801802 0.285714 0.761468 0.809091 0.400000 0.700422 0.805430
15 Gradient Boosting Classifier 0.800000 0.850467 0.901786 0.800000 0.834862 0.918182 0.800000 0.842593 0.909910
In [382]:
models = [('Naive Bayes', nb_accuracy, nb_taccuracy, cv_nb_m),
          ('Support Vector Classification', svc_accuracy, svc_taccuracy, cv_svc_m),
          ('Support Vector Classification with GridSearchCV', svc_gscv_accuracy, svc_gscv_taccuracy, cv_svc_gscv_b),
          ('Logistic Regression', lr_accuracy, lr_taccuracy, cv_lr_m),
          ('Logistic Regression with GridSearchCV', lr_gs_accuracy, lr_gs_taccuracy, cv_lr_gs_b),
          ('AdaBoost Classifier', abc_accuracy, abc_taccuracy, cv_abc_m),
          ('K-Nearest Neighbors with GridSearchCV', knn_accuracy, knn_taccuracy, cv_knn_b),
          ('Decision Trees', dtc_accuracy, dtc_taccuracy, cv_dt_m),
          ('Decision Trees with GridSearchCV', dtc_pt_accuracy, dtc_pt_taccuracy, cv_dt_gscv_b),
          ('Random Forest Classifier', rfc_accuracy, rfc_taccuracy, cv_rfc_m),
          ('Random Forest with RandomizedSearchCV', rfc_rscv_accuracy, rfc_rscv_taccuracy, cv_rfc_rscv_b),
          ('XGBoost Classifier', xgbc_accuracy, xgbc_taccuracy, cv_xgbc_m),
          ('Stochastic Gradient Descent', sgd_accuracy, sgd_taccuracy, cv_sgd_m),
          ('Gradient Boosting Classifier', gbc_accuracy, gbc_taccuracy, cv_gbc_m),
          ('Neural Network (48, 24, 12)', mlp_accuracy, mlp_taccuracy, "None"),
          ('Neural Network (500, 250)', mlp2_accuracy, mlp2_taccuracy, "None")]
In [383]:
performance = pd.DataFrame(data=models, columns=['Model', 'Accuracy(Test Set)', 'Accuracy(Training Set)', 'Cross-Validation'])
performance
Out[383]:
Model Accuracy(Test Set) Accuracy(Training Set) Cross-Validation
0 Naive Bayes 0.692913 0.708498 0.702631
1 Support Vector Classification 0.799213 0.832016 0.821171
2 Support Vector Classification with GridSearchCV 0.811024 0.833992 0.827082
3 Logistic Regression 0.728346 0.752964 0.74207
4 Logistic Regression with GridSearchCV 0.755906 0.791502 0.786566
5 AdaBoost Classifier 0.771654 0.784585 0.774714
6 K-Nearest Neighbors with GridSearchCV 0.877953 1.000000 0.887352
7 Decision Trees 0.838583 1.000000 0.836993
8 Decision Trees with GridSearchCV 0.818898 0.850791 0.830052
9 Random Forest Classifier 0.830709 0.859684 0.830062
10 Random Forest with RandomizedSearchCV 0.826772 0.861660 0.835022
11 XGBoost Classifier 0.862205 1.000000 0.872578
12 Stochastic Gradient Descent 0.716535 0.726285 0.74107
13 Gradient Boosting Classifier 0.866142 0.974308 0.863667
14 Neural Network (48, 24, 12) 0.429134 0.442688 None
15 Neural Network (500, 250) 0.704724 0.733202 None
In [384]:
performance.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Model                   16 non-null     object 
 1   Accuracy(Test Set)      16 non-null     float64
 2   Accuracy(Training Set)  16 non-null     float64
 3   Cross-Validation        16 non-null     object 
dtypes: float64(2), object(2)
memory usage: 640.0+ bytes
In [385]:
performance['Cross-Validation'][:14]
Out[385]:
0     0.702631
1     0.821171
2     0.827082
3      0.74207
4     0.786566
5     0.774714
6     0.887352
7     0.836993
8     0.830052
9     0.830062
10    0.835022
11    0.872578
12     0.74107
13    0.863667
Name: Cross-Validation, dtype: object
In [386]:
performance['Cross-Validation'] = performance['Cross-Validation'][:14].astype('float64')
In [387]:
f, axe = plt.subplots(1, 1, figsize=(10, 6))
performance.sort_values(by=['Cross-Validation'][:14], ascending=False, inplace=True)
sns.barplot(x='Cross-Validation', y='Model', data=performance[:14], ax=axe)
axe.set_xlabel('Cross-Validaton Score', size=14)
axe.set_ylabel('Models', size=14)
axe.set_xlim(0, 1.0)
axe.set_xticks(np.arange(0, 1.1, 0.1))
# plt.title("Cross-Validaton Score Plot")
plt.tight_layout()
In [388]:
f, axes = plt.subplots(2, 1, figsize=(12, 10))

performance.sort_values(by=['Accuracy(Training Set)'], ascending=False, inplace=True)
sns.barplot(x='Accuracy(Training Set)', y='Model', data=performance, palette='Blues_d', ax=axes[0])
axes[0].set_xlabel('Accuracy (Training Set)', size=14)
axes[0].set_ylabel('Model', size=14)
axes[0].set_xlim(0, 1.0)
axes[0].set_xticks(np.arange(0, 1.1, 0.1))

performance.sort_values(by=['Accuracy(Test Set)'], ascending=False, inplace=True)
sns.barplot(x='Accuracy(Test Set)', y='Model', data=performance, palette='Reds_d', ax=axes[1])
axes[1].set_xlabel('Accuracy (Test Set)', size=14)
axes[1].set_ylabel('Model', size=14)
axes[1].set_xlim(0, 1.0)
axes[1].set_xticks(np.arange(0, 1.1, 0.1))

# plt.title("Accuracy Plot")
plt.tight_layout()
In [389]:
# Sorted based on Accuracy(Test Set)
performance.sort_values(by=['Accuracy(Test Set)'], ascending=False, inplace=True)
performance
Out[389]:
Model Accuracy(Test Set) Accuracy(Training Set) Cross-Validation
6 K-Nearest Neighbors with GridSearchCV 0.877953 1.000000 0.887352
13 Gradient Boosting Classifier 0.866142 0.974308 0.863667
11 XGBoost Classifier 0.862205 1.000000 0.872578
7 Decision Trees 0.838583 1.000000 0.836993
9 Random Forest Classifier 0.830709 0.859684 0.830062
10 Random Forest with RandomizedSearchCV 0.826772 0.861660 0.835022
8 Decision Trees with GridSearchCV 0.818898 0.850791 0.830052
2 Support Vector Classification with GridSearchCV 0.811024 0.833992 0.827082
1 Support Vector Classification 0.799213 0.832016 0.821171
5 AdaBoost Classifier 0.771654 0.784585 0.774714
4 Logistic Regression with GridSearchCV 0.755906 0.791502 0.786566
3 Logistic Regression 0.728346 0.752964 0.742070
12 Stochastic Gradient Descent 0.716535 0.726285 0.741070
15 Neural Network (500, 250) 0.704724 0.733202 NaN
0 Naive Bayes 0.692913 0.708498 0.702631
14 Neural Network (48, 24, 12) 0.429134 0.442688 NaN
In [390]:
# Sorted based on Accuracy(Training Set)
performance.sort_values(by=['Accuracy(Training Set)'], ascending=False, inplace=True)
performance
Out[390]:
Model Accuracy(Test Set) Accuracy(Training Set) Cross-Validation
6 K-Nearest Neighbors with GridSearchCV 0.877953 1.000000 0.887352
11 XGBoost Classifier 0.862205 1.000000 0.872578
7 Decision Trees 0.838583 1.000000 0.836993
13 Gradient Boosting Classifier 0.866142 0.974308 0.863667
10 Random Forest with RandomizedSearchCV 0.826772 0.861660 0.835022
9 Random Forest Classifier 0.830709 0.859684 0.830062
8 Decision Trees with GridSearchCV 0.818898 0.850791 0.830052
2 Support Vector Classification with GridSearchCV 0.811024 0.833992 0.827082
1 Support Vector Classification 0.799213 0.832016 0.821171
4 Logistic Regression with GridSearchCV 0.755906 0.791502 0.786566
5 AdaBoost Classifier 0.771654 0.784585 0.774714
3 Logistic Regression 0.728346 0.752964 0.742070
15 Neural Network (500, 250) 0.704724 0.733202 NaN
12 Stochastic Gradient Descent 0.716535 0.726285 0.741070
0 Naive Bayes 0.692913 0.708498 0.702631
14 Neural Network (48, 24, 12) 0.429134 0.442688 NaN